In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import collections
from collections import Counter as count

# *Data Ingestion :*

In [None]:
# Read the student-evaluation CSV into a DataFrame and preview its first 20 rows.
csv_path = "../input/uci-turkiye-student-evaluation-data-set/turkiye-student-evaluation_generic.csv"
data = pd.read_csv(csv_path)
data.head(20)

In [None]:
# Column labels of the loaded frame.
data.columns

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

### *Null Values :*

In [None]:
# Number of missing values in each column.
data.isnull().sum()

**There are no null values in a given dataset.**

### *Descriptive Statistics :*

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

**Inference :**
* Descriptive statistics of the columns difficulty, Q1, Q2, ..., Q28 are almost the same.
* The mean and median of the class column are approximately the same.


In [None]:
# Shape shown again; no cell between the previous shape check and this one modifies `data`.
data.shape

# *Exploratory Data Analysis :*

In [None]:
# Bar chart of the number of rows for each value of the `class` column.
sns.countplot(data=data, x='class')
plt.show()

In [None]:
# Box plot of the answers to every question column.
# The columns are selected by label instead of by position: the earlier positional
# slice `iloc[:, 6:]` started one column too late and silently left Q1 out.
# NOTE(review): assumes the question columns are contiguous and labelled Q1..Q28,
# as the descriptive-statistics notes state - confirm against `data.columns`.
plt.figure(figsize=(15,6))
sns.boxplot(data=data.loc[:, 'Q1':'Q28'])
plt.show()

**The box plot shows that questions Q14, Q15, Q17, Q19 to Q22 and Q25 received good ratings.**

# *Scaling :*

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()
data = pd.DataFrame(sc.fit_transform(data),columns=data.columns)

In [None]:
data

# *K Means Clustering :*

In [None]:
from sklearn.cluster import KMeans

In [None]:
cluster_range = range(1,20)
cluster_errors = []
for num_cluster in cluster_range:
    clusters = KMeans(num_cluster)
    clusters.fit(data)
    cluster_errors.append(clusters.inertia_) 

In [None]:
pd.DataFrame({'No of Clusters':cluster_range, 'Cluster Error':cluster_errors})

### *Elbow Plot :* 

In [None]:
# Elbow plot: K-Means inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(cluster_range, cluster_errors, marker='o')
ax.set_title('Elbow Plot')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Error of Clusters')
ax.set_xticks(cluster_range)
plt.show()

**Based on the elbow graph we can go for 3 clusters.**

In [None]:
kmeans = KMeans(n_clusters=3)
y_kmeans = kmeans.fit_predict(data)

In [None]:
y_kmeans

In [None]:
k_clusters = count(y_kmeans)
k_clusters

**The counts above are the sizes of the 3 clusters.**

# *Hierarchical Clustering :*

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Ward-linkage hierarchy of the scaled data, drawn as a dendrogram truncated
# to the top 10 levels.
Z = linkage(data, method='ward')
plt.figure(figsize=(20, 10))
dendrogram(
    Z,
    truncate_mode='level',
    p=10,
    color_threshold=8,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.title('Dendogram')
plt.show()

**The dendrogram suggests that 3 is the optimal number of clusters.**

**Now fit Hierarchical clustering to the data**

## *Agglomerative Clustering :*
#### This is a hierarchical clustering algorithm.

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean')

In [None]:
ac.fit(data)

In [None]:
ac.labels_ ## clusters

In [None]:
h_clusters = count(ac.labels_)
h_clusters

In [None]:
# K-Means cluster sizes, displayed again for comparison.
k_clusters

In [None]:
# Agglomerative cluster sizes, displayed again for comparison.
h_clusters

In [None]:
# Cluster sizes of both algorithms side by side, indexed by cluster id.
# Cluster ids are arbitrary labels, so a row does not pair up "the same" cluster
# across the two methods. (The unused `clusters` list was dropped.)
pd.DataFrame({'K_Clusters':k_clusters, 'Hierarchical':h_clusters})

**Inference :**
* The dataframe above lets us compare the cluster sizes produced by the two algorithms.
* The size of the third cluster is similar for both methods.

# *Convert Unsupervised data into Supervised data :*

In [None]:
df=data.copy()

In [None]:
kmeans = KMeans(n_clusters=3, max_iter=100)

In [None]:
kmeans.fit(df)

In [None]:
count(kmeans.labels_) # clusters

In [None]:
df['label'] = kmeans.labels_

In [None]:
df.head(10)

In [None]:
# no outlier 
df.label.plot(kind='box')
# sns.boxplot(x='label',data=df)
plt.show()

In [None]:
df.label.value_counts().plot(kind='bar')
plt.show()

In [None]:
df['label'].value_counts()

In [None]:
# Pairwise scatter plots of the columns of df, coloured by cluster label.
# NOTE(review): df carries every scaled column plus `label`, so this grid is
# very large and is likely slow to render - consider plotting a column subset.
sns.pairplot(df,hue='label')
plt.show()

# *PCA :*

**Since the data is already scaled, we now apply PCA for dimensionality reduction:**
#### Apply it only to the features.

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA()

In [None]:
data_pca = pca.fit_transform(data)

In [None]:
data_pca.shape

In [None]:
pca.components_

In [None]:
# np.cumsum is used to calculate the accumulative sum of array
pca.explained_variance_ratio_ 
# The pca.explained_variance_ratio_ parameter returns a vector of the variance explained by each dimension.

In [None]:
cumsum=np.cumsum(pca.explained_variance_ratio_)
cumsum

In [None]:
# Cumulative explained variance against the number of principal components.
plt.figure(figsize=(10,6))

# The x-axis is derived from the length of `cumsum` rather than a hard-coded
# component count, so the plot keeps working if the number of columns changes.
plt.plot(range(1, len(cumsum) + 1), cumsum, color='k', lw=2)

plt.xlabel('Number of components')
plt.ylabel('Total explained variance')

# Guide lines at 8 components and at 90% explained variance.
plt.axvline(8, c='b')
plt.axhline(0.9, c='r')

plt.show()

**8 components account for 90 percent of the variance in the data.**
#### Keeping 8 components therefore retains about 90% of the variance.

In [None]:
pca = PCA(n_components=8)
pca.fit(data)
data_pca = pd.DataFrame(pca.transform(data))
data_pca.shape

In [None]:
data_pca.head(10)

In [None]:
# In statistics, kernel density estimation is a non-parametric 
# way to estimate the probability density function of a random variable. 

sns.pairplot(data_pca, diag_kind='kde')
plt.show()

### *Kmeans Clustering :*

In [None]:
# Fit K-Means for k = 1..15 on the PCA-reduced data and record each inertia.
cluster_range = range(1,16)
cluster_errors = []

for num_clusters in cluster_range:
    # Seeded so the elbow curve is identical on every run.
    clusters = KMeans(n_clusters=num_clusters, n_init=10, max_iter=100, random_state=1)
    clusters.fit(data_pca)
    cluster_errors.append(clusters.inertia_)

pd.DataFrame({'num_clusters':cluster_range, 'Error': cluster_errors})

**Elbow Plot :**

In [None]:
# Elbow plot for the PCA-reduced data: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(cluster_range, cluster_errors, marker="o")
ax.set_title('Elbow Plot')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Error')
ax.set_xticks(cluster_range)
plt.show()

In [None]:
# Cluster the PCA-reduced data into 3 groups and report the cluster sizes.
# A copy is labelled so `data_pca` itself stays feature-only.
pca_df = data_pca.copy()
# Seeded so the cluster sizes shown here can be reproduced.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100, random_state=1)
kmeans.fit(pca_df)
pca_df['label'] = kmeans.labels_
pca_df['label'].value_counts()

### *Agglomerative Clustering :*

In [None]:
# Ward-linkage hierarchy of the PCA-reduced data, drawn as a dendrogram
# truncated to the top 10 levels.
link = linkage(data_pca, method='ward')
plt.figure(figsize=(20, 10))
dendrogram(
    link,
    truncate_mode='level',
    p=10,
    color_threshold=8,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.title('Dendogram')
plt.show()

**From the dendrogram above we can see 3 clusters.**

In [None]:
ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean',  linkage='ward')
ac.fit(data_pca)

In [None]:
y_ac=ac.fit_predict(data_pca)

In [None]:
count(y_ac)

In [None]:
# Cluster sizes per cluster id (0, 1, 2) for K-Means and agglomerative clustering
# on the PCA-reduced data. The sizes are read from the fitted results instead of
# being typed in by hand, so the table always matches the current run.
kmeans_sizes = pca_df['label'].value_counts().reindex(range(3), fill_value=0)
agglm_sizes = pd.Series(y_ac).value_counts().reindex(range(3), fill_value=0)

# Each list holds [K-Means size, agglomerative size] for one cluster id.
first0 = [int(kmeans_sizes.loc[0]), int(agglm_sizes.loc[0])]
second1 = [int(kmeans_sizes.loc[1]), int(agglm_sizes.loc[1])]
third2 = [int(kmeans_sizes.loc[2]), int(agglm_sizes.loc[2])]
clusters=['Kmeans','Agglm Cluster']
d=pd.DataFrame({'Clusters':clusters,'FirstC':first0,'SecondC':second1,'ThirdC':third2})
d

**Inference :**
* The first cluster is somewhat similar in size for both methods.

# *Splitting the data before PCA :*

In [None]:
df.head()

In [None]:
X=df.drop(columns='label')
y=df['label']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 train/test split with a fixed seed, followed by the shape of each part.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

for part in (Xtrain, Xtest, ytrain, ytest):
    print(part.shape)

### *Logistic Regression :*

In [None]:
from sklearn import metrics

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)

In [None]:
print('Training score =', lr.score(Xtrain, ytrain))
print('Test score =', lr.score(Xtest, ytest))

In [None]:
ypred1=lr.predict(Xtest)

In [None]:
acc1=(metrics.accuracy_score(ytest,ypred1))
acc1

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, ypred1)
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

**Model is good fit.**

### *Decision Tree Classifier :*

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(Xtrain, ytrain)

print('Training score =', dt.score(Xtrain, ytrain))
print('Test score =', dt.score(Xtest, ytest))

In [None]:
ypred2=dt.predict(Xtest)

In [None]:
acc2=(metrics.accuracy_score(ytest,ypred2))
acc2

In [None]:
cm = confusion_matrix(ytest, ypred2)
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

**The model is overfit: it scores perfectly on the training set but noticeably lower on the test set.**

### *KNN :*

In [None]:
from sklearn.neighbors import KNeighborsClassifier

score=[]
for k in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(Xtrain, ytrain)
    ypred3=knn.predict(Xtest)
    accuracy=metrics.accuracy_score(ypred3,ytest)
    score.append(accuracy*100)
    print (k,': ',accuracy)

In [None]:
score.index(max(score))+1

In [None]:
round(max(score))

In [None]:
# Refit k-NN with the k that scored best in the sweep above. The value is read
# from `score` (score[i] belongs to k = i + 1) instead of being typed in by hand,
# so it stays correct when the data or the cluster labels change.
best_k = score.index(max(score)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(Xtrain, ytrain)

print('Training score =', knn.score(Xtrain, ytrain))
print('Test score =', knn.score(Xtest, ytest))

**Model is good fit:**

### *Naive Bayes :*

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes on the scaled features, with train and test accuracy.
gnb = GaussianNB()
gnb.fit(Xtrain, ytrain)

print(f'Training score = {gnb.score(Xtrain, ytrain)}')
print(f'Test score = {gnb.score(Xtest, ytest)}')

**Model is Best Fit.**

In [None]:
Algorithm=['LogisticRegression','Decision Tree','KNN','Naive Bayes']
Train_Accuracy=[0.985,1.00,0.977,0.988]
Test_Accuracy=[0.975,0.939,0.963,0.988]

In [None]:
Before_PCA = pd.DataFrame({'Algorithm': Algorithm,'Train_Accuracy': Train_Accuracy,'Test_Accuracy':Test_Accuracy})
Before_PCA

**Inference :**
* The Naive Bayes algorithm performed well, with an accuracy of 98.8 percent.
* Decision Tree did not generalise as well: it is overfit (100% on training versus 93.9% on testing).

# *Splitting the data after PCA :*

In [None]:
df1=data_pca.copy()

In [None]:
kmeans = KMeans(3, n_init=5, max_iter=100)
kmeans.fit(df1)
df1['label'] = kmeans.labels_
df1.head()

In [None]:
X1=df1.drop(columns='label')
y1=df1['label']

In [None]:
# 70/30 train/test split of the PCA features with a fixed seed,
# followed by the shape of each part.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=1)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

### *Logistic Regression :*

In [None]:
# Logistic regression on the PCA features, with train and test accuracy.
lr_pca = LogisticRegression()
lr_pca.fit(X_train, y_train)
print(f'Training score = {lr_pca.score(X_train, y_train)}')
print(f'Test score = {lr_pca.score(X_test, y_test)}')

**Model is good fit.**

### *Decision Tree Classifier :*

In [None]:
# Decision tree on the PCA features. random_state makes the fitted tree, and
# therefore both scores, identical on every run.
dt_pca = DecisionTreeClassifier(random_state=1)
dt_pca.fit(X_train, y_train)
print('Training score =', dt_pca.score(X_train, y_train))
print('Test score =', dt_pca.score(X_test, y_test))

**Model is slightly overfit.**

### *KNN :*

In [None]:
score=[]
for k in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    ypred=knn.predict(X_test)
    accuracy=metrics.accuracy_score(ypred,y_test)
    score.append(accuracy*100)
    print (k,': ',accuracy)

In [None]:
score.index(max(score))+1

In [None]:
(max(score))

In [None]:
# Refit k-NN on the PCA features with the k that scored best in the sweep above.
# The value is read from `score` (score[i] belongs to k = i + 1) instead of being
# typed in by hand, so it stays correct when the data or the labels change.
best_k_pca = score.index(max(score)) + 1
knn_pca = KNeighborsClassifier(n_neighbors=best_k_pca)
knn_pca.fit(X_train, y_train)

print('Training score =', knn_pca.score(X_train, y_train))
print('Test score =', knn_pca.score(X_test, y_test))

**Model is good fit.**

### *Naive Bayes :*

In [None]:
# Gaussian naive Bayes on the PCA features, with train and test accuracy.
gnb_pca = GaussianNB()
gnb_pca.fit(X_train, y_train)
print(f'Training score = {gnb_pca.score(X_train, y_train)}')
print(f'Test score = {gnb_pca.score(X_test, y_test)}')

**Model is good fit.**

In [None]:
Algorithm=['LogisticRegression','Decision Tree','KNN','Naive Bayes']
Train_Accuracy=[0.987,1.00,0.987,0.975]
Test_Accuracy=[0.979,0.995,0.980,0.967]

In [None]:
After_PCA = pd.DataFrame({'Algorithm': Algorithm,'Train_Accuracy': Train_Accuracy,'Test_Accuracy':Test_Accuracy})
After_PCA

**Inference :**
* All the models performed well.
* Decision Tree has 100% on training and 99.5% on testing.

# *Final Model :*

In [None]:
Algorithm=['LR BPCA','DT BPCA','KNN BPCA','NB BPCA','LR APCA','DT APCA','KNN APCA','NB APCA']
Train_Accuracy=[0.985,1.00,0.977,0.988,0.987,1.00,0.987,0.975]
Test_Accuracy=[0.975,0.939,0.963,0.988,0.979,0.995,0.980,0.967]

In [None]:
Final = pd.DataFrame({'Algorithm': Algorithm,'Train_Accuracy': Train_Accuracy,'Test_Accuracy':Test_Accuracy})
Final

In [None]:
# Train and test accuracy of every model, drawn as two lines on one axis.
# `palette` is no longer passed: without a `hue` variable seaborn ignores it
# (and warns), so each line simply takes the next colour of the default cycle.
plt.subplots(figsize=(15,6))
sns.lineplot(x="Algorithm", y="Train_Accuracy", data=Final, label='Train Accuracy')
sns.lineplot(x="Algorithm", y="Test_Accuracy", data=Final, label='Test Accuracy')

plt.xticks(rotation=90)
plt.title('MLA Accuracy Comparison')
plt.legend()
plt.show()

**Inference :**
* Naive Bayes before PCA performed well.
* Logistic Regression after PCA performed well.
* Naive Bayes (before PCA) is the best of all the models: its training and testing scores are equal.