In [None]:
#importing the libraries
import pandas as pd
import numpy as np

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training features, test features and training labels.
# header=None makes pandas treat the first row of each file as data.
train = pd.read_csv(
    '../input/data-science-london-scikit-learn/train.csv',
    header=None,
)
test = pd.read_csv(
    '../input/data-science-london-scikit-learn/test.csv',
    header=None,
)
# The single label column is given the name 'target' at load time.
trainLabel = pd.read_csv(
    '../input/data-science-london-scikit-learn/trainLabels.csv',
    header=None,
    names=['target'],
)

In [None]:
# Report the dimensions of each loaded frame, then preview the training features.
for caption, frame in (
    ('train shape:', train),
    ('test shape:', test),
    ('trainLabel shape:', trainLabel),
):
    print(caption, frame.shape)

train.head(10)

In [None]:
# Display pandas' descriptive statistics for the columns of `train`.
train.describe()

In [None]:
# Print pandas' concise summary of `train` (index, column dtypes, non-null counts).
train.info()

In [None]:
# Features are the training frame as loaded; labels are flattened to a 1-D array.
X = train
y = np.ravel(trainLabel)

In [None]:
# Hold out 20% of the rows as a validation set; the seed is fixed so the split repeats.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Candidate settings; only the number of neighbours is actually swept below,
# `algorithm` and `weights` are listed but not iterated over.
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
weights = ['uniform', 'distance']
neig = range(1, 20)

train_accuracy = []
val_accuracy = []
best_score = 0.0
best_knn = None

# Fit one classifier per k and keep the first one that reaches the highest
# validation accuracy (strict '>' means ties keep the smaller k).
for k in neig:
    KNN = KNeighborsClassifier(
        n_neighbors=k,
        algorithm='auto',
        weights='uniform',
    )
    KNN.fit(X_train, y_train)

    y_pred = KNN.predict(X_val)
    train_score = KNN.score(X_train, y_train)
    val_score = accuracy_score(y_val, y_pred)

    # Record both curves for the plot below.
    train_accuracy.append(train_score)
    val_accuracy.append(val_score)

    if val_score > best_score:
        best_score, best_knn = val_score, KNN

# Accuracy as a function of the number of neighbours.
plt.figure(figsize=(10, 10))
for scores, colour, caption in (
    (train_accuracy, 'blue', 'train accuracy'),
    (val_accuracy, 'red', 'val accuracy'),
):
    plt.plot(neig, scores, c=colour, label=caption)
plt.legend()
plt.title('number of neighbors with accuracy')
plt.xlabel('n _neighbors')
plt.ylabel('Accuracy')

# Summary of the selected model.
print('train score : ', best_knn.score(X_train, y_train))
print('val score : ', best_score)
print(best_knn)

In [None]:
# Grid-search a random forest on the raw features using 6-fold cross-validation
# on the training split, then score the refitted best model on the validation split.
RandomForesClassifiertModel = RandomForestClassifier(random_state=100)

# Hyper-parameter grid: 4 x 4 x 3 = 48 combinations.
estimator = [20, 50, 70, 100]
max_depth = [20, 30, 40, 60]
split = [5, 10, 15]
param = dict(n_estimators=estimator, max_depth=max_depth, min_samples_split=split)

RandomForestCV = GridSearchCV(
    estimator=RandomForesClassifiertModel,
    param_grid=param,
    cv=6,
    n_jobs=-1,  # use all available cores
)
RandomForestCV.fit(X_train, y_train)
y_pred = RandomForestCV.predict(X_val)

print(RandomForestCV.best_params_)
print('score train : ', RandomForestCV.score(X_train, y_train))
# accuracy_score expects (y_true, y_pred); the printed label says "test" but
# the value is computed on the validation split.
print('score test  : ', accuracy_score(y_val, y_pred))

In [None]:
# Kernels accepted by SVC, listed for reference; only 'rbf' is used below.
kernel = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

# RBF support-vector classifier on the raw features, capped at 1000 solver iterations.
SVCModel = SVC(
    kernel='rbf',
    C=0.1,
    max_iter=1000,
)
SVCModel.fit(X_train, y_train)
y_pred = SVCModel.predict(X_val)

print('score train : ', SVCModel.score(X_train, y_train))
print('score test  : ', accuracy_score(y_val, y_pred))

 We can apply a Gaussian Mixture Model

In [None]:

print('X shape :',X.shape)
print('\n')

# USING THE GAUSSIAN MIXTURE MODEL
#
# Fit a GaussianMixture for every combination of covariance type and number of
# components, keep the one with the lowest information criterion, and use its
# per-component membership probabilities as new features.
#
# NOTE(review): the variables are named `bic`/`lowest_bic`, but the score
# actually computed is the AIC (`gmm.aic`). Both criteria are "lower is better".
# The names are kept so other cells that may read them keep working; confirm
# which criterion is intended.

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0
bic = []
best_gmm = None

# Covariance constraints supported by GaussianMixture.
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in range(1,7):
        # A fixed seed makes the selection reproducible across runs.
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              random_state=100)
        gmm.fit(X)
        bic.append(gmm.aic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# best_gmm is already fitted on X, so it is not refitted here: a second fit
# would only repeat the work (or, without a seed, could yield a different model
# from the one whose score was selected).
gmm_train = best_gmm.predict_proba(X_train)
gmm_val = best_gmm.predict_proba(X_val)
gmm_test = best_gmm.predict_proba(test)

# Report the selected model and its own score (not the last model from the loop).
print(best_gmm)
print(best_gmm.aic(X))

In [None]:
# Compare the shapes of the GMM-derived features with the raw splits.
# The first label previously read "gmm_test" although gmm_val.shape is printed.
print("gmm_val",gmm_val.shape)
print("gmm_train",gmm_train.shape)
print("X_train",X_train.shape)
print("x_val",X_val.shape)

# now we can apply Support Vector Classifier Model

In [None]:
# Kernels accepted by SVC, listed for reference; only 'rbf' is used below.
kernel = ['linear','poly','rbf','sigmoid','precomputed']

# RBF support-vector classifier trained on the GMM membership probabilities.
SVCModel = SVC(kernel='rbf', max_iter=1000, C=0.1)
SVCModel.fit(gmm_train, y_train)
y_pred = SVCModel.predict(gmm_val)

print('score train : ', SVCModel.score(gmm_train, y_train))
# accuracy_score expects (y_true, y_pred); the printed label says "test" but
# the value is computed on the validation split.
print('score test  : ', accuracy_score(y_val, y_pred))


# Next, let's apply the KNeighborsClassifier model

In [None]:
# Candidate settings; only the number of neighbours is actually swept below,
# `algorithm` and `weights` are listed but not iterated over.
algorithm=['auto', 'ball_tree', 'kd_tree', 'brute']
weights=['uniform','distance']
neig=range(1,20)
train_accuracy=[]
val_accuracy=[]
best_score=0.0
best_knn=None

# Fit one KNN per k on the GMM membership probabilities and keep the first
# model that reaches the highest validation accuracy.
for k in neig:
    KNN=KNeighborsClassifier(n_neighbors=k,algorithm='auto',weights='uniform')
    KNN.fit(gmm_train,y_train)
    y_pred=KNN.predict(gmm_val)
    train_score=KNN.score(gmm_train,y_train)
    val_score=accuracy_score(y_val,y_pred)
    # Record both curves for the plot below.
    train_accuracy.append(train_score)
    val_accuracy.append(val_score)

    # Strict '>' means ties keep the smaller k.
    if val_score > best_score :
        best_score=val_score
        best_knn=KNN

# Accuracy as a function of the number of neighbours.
plt.figure(figsize=(10,10))
plt.plot(neig,train_accuracy,c='blue',label='train accuracy')
plt.plot(neig,val_accuracy,c='red',label='val accuracy')
plt.legend()
plt.title('number of neighbors with accuracy')
plt.xlabel('n _neighbors')
plt.ylabel('Accuracy')

print('train score : ',best_knn.score(gmm_train,y_train))
# Report the best model's validation accuracy. After the loop `y_pred` holds
# the predictions of the LAST k tried, so scoring it here would describe the
# wrong model.
print('val score : ',best_score)


print(best_knn)

# we can apply Random Forest Classifier Model

In [None]:
# Grid-search a random forest on the GMM membership probabilities using 6-fold
# cross-validation on the training split, then score the refitted best model
# on the validation split.
RandomForesClassifiertModel = RandomForestClassifier(random_state=100)

# Hyper-parameter grid: 4 x 4 x 3 = 48 combinations.
estimator = [20, 50, 70, 100]
max_depth = [20, 30, 40, 60]
split = [5, 10, 15]
param = dict(n_estimators=estimator, max_depth=max_depth, min_samples_split=split)

RandomForestCV = GridSearchCV(
    estimator=RandomForesClassifiertModel,
    param_grid=param,
    cv=6,
    n_jobs=-1,  # use all available cores
)
RandomForestCV.fit(gmm_train, y_train)
y_pred = RandomForestCV.predict(gmm_val)

print(RandomForestCV.best_params_)
print('score train : ', RandomForestCV.score(gmm_train, y_train))
# accuracy_score expects (y_true, y_pred); the printed label says "test" but
# the value is computed on the validation split.
print('score test  : ', accuracy_score(y_val, y_pred))
