In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# IPython line magic: display the kernel's current working directory.
%pwd

### Importing the required libraries 

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to matplotlib figures created after this point.
matplotlib.style.use('ggplot')
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# 1- Reading the training and testing datasets

In [None]:
# The competition CSVs are read with header=None, so pandas assigns integer
# column names instead of consuming the first record as a header.
train = pd.read_csv(
    '/kaggle/input/data-science-london-scikit-learn/train.csv',
    sep=',',
    header=None,
)
test = pd.read_csv(
    '/kaggle/input/data-science-london-scikit-learn/test.csv',
    sep=',',
    header=None,
)
train_label = pd.read_csv(
    '/kaggle/input/data-science-london-scikit-learn/trainLabels.csv',
    header=None,
)

In [None]:
# Summarise the training features: column dtypes, non-null counts, memory usage.
train.info()

In [None]:
# Summarise the competition test features: column dtypes, non-null counts, memory usage.
test.info()

In [None]:
# Descriptive statistics of the competition test set, one row per feature.
test.describe().T

In [None]:
# Descriptive statistics of the training set, one row per feature.
train.describe().T

#### Checking if there is any null or NaN value in both training and testing sets

In [None]:
# Report whether either feature table contains missing values.
# pandas documents isnull() as an alias of isna(), so the two checks per
# table are expected to agree.
print('Train: Does NaN values exist',train.isna().values.any())
print('Train: Does Null values exist',train.isnull().values.any())
print('Test: Does NaN values exist',test.isna().values.any())
# Label fixed: this line reports the isnull() check, it previously repeated "NaN".
print('Test: Does Null values exist',test.isnull().values.any())

##### Counting the number of classes ==> To check if classes are well balanced

In [None]:
# np.where on the boolean frame returns (row_indices, column_indices);
# the row-index array holds one entry per matching label.
class_0 = np.where(train_label == 0)
class_1 = np.where(train_label == 1)

n_class_0 = class_0[0].size
n_class_1 = class_1[0].size

print('Number of samples for class 0:', n_class_0)
print('Number of samples for class 1:', n_class_1)

# 2- Splitting the total dataset into training and testing 

In [None]:
# Hold out 20% of the labelled data for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_label,
    test_size=0.2,
    random_state=30,
)

##### Printing the dimensions of the datasets 

In [None]:
# Show (rows, columns) for each feature split produced above.
for caption, split in (('X train dimension:', X_train), ('X test dimension :', X_test)):
    print(caption, split.shape)

# 3- Data Normalisation and Standardization
## 3.1- Data Normalisation 

The Normalization and the Standardization are called feature scaling techniques. Normalization is also known under the name Min-Max scaling. This technique consists of shifting and rescaling the values, using the minimum and the maximum values of the training dataset, so that they end up ranging between 0 and 1. 

Standardization is another scaling technique that consists of scaling the values using the mean and standard deviation of the training dataset. After standardization, each feature is centered on a mean of zero and has a standard deviation of 1. 

In [None]:
# Learn the per-feature min/max on the training split only, then apply that
# same mapping to both splits so the held-out data does not influence scaling.
normalizer = MinMaxScaler().fit(X_train)
normalized_train_X = normalizer.transform(X_train)
normalized_test_X = normalizer.transform(X_test)


## 3.2- Data Standardization 

In [None]:
# Learn the per-feature mean/standard deviation on the training split only,
# then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)
scaled_train_X = scaler.transform(X_train)
scaled_test_X = scaler.transform(X_test)


# 4- Application of our algorithm 
## 4.1-Decision tree 
### a. Before data Normalization and Standardisation 

A decision tree is one of the most popular and powerful algorithms for classification and prediction. It is named a decision tree because it builds a tree-like structure. It starts by splitting the total dataset into two subsets based on the most important feature. The most important feature is selected according to a classification criterion (for example, the Gini impurity used below).  

This process is repeated on each derived subset in a recursive manner, called recursive partitioning, until all the leaf nodes are pure, meaning that all the data in each leaf belongs to a single class. The construction of a decision tree classifier does not require any domain knowledge or parameter setting, and is therefore appropriate for exploratory knowledge discovery. In general, a decision tree classifier has good accuracy.

In [None]:
# Decision tree (Gini criterion, fixed seed) trained on the unscaled features.
clf = tree.DecisionTreeClassifier(criterion="gini", random_state=42).fit(X_train, y_train)
y_predDT = clf.predict(X_test)
# Held-out accuracy, shown as the cell output.
accuracy_score(y_test, y_predDT)

In [None]:
# Confusion matrix of the decision tree trained on unscaled features.
print('confusion matrix:')
dt_confusion = confusion_matrix(y_test, y_predDT)
print(dt_confusion)

In [None]:
# Classification report for the decision tree trained on unscaled features.
print('Classification report:')
dt_report = classification_report(y_test, y_predDT)
print(dt_report)

### b. After data Normalization

In [None]:
# Same decision tree configuration, trained on the min-max normalized features.
clf = tree.DecisionTreeClassifier(criterion="gini", random_state=42).fit(normalized_train_X, y_train)
y_predDT1 = clf.predict(normalized_test_X)
# Held-out accuracy, shown as the cell output.
accuracy_score(y_test, y_predDT1)

### c. After data Standardisation

In [None]:
# Same decision tree configuration, trained on the standardized features.
clf = tree.DecisionTreeClassifier(criterion="gini", random_state=42).fit(scaled_train_X, y_train)
y_predDT2 = clf.predict(scaled_test_X)
# Held-out accuracy, shown as the cell output.
accuracy_score(y_test, y_predDT2)

## 4.2-SVM: Support Vector Machine 
### a. Before data Normalization and Standardisation

It is a supervised machine learning algorithm that can be used for both classification and regression tasks, but it is most commonly adopted for classification problems. 

The linear SVM classifier works by drawing a straight line between two classes. It mainly consists of finding the hyperplane equation that distinctly classifies data points or observations.

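Our goal is to find the hyperplane that has the maximum margin, i.e. the maximum distance to the nearest data points of both classes. The cells below use an RBF kernel, which applies the same maximum-margin idea after a non-linear mapping of the features.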

In [None]:
# RBF-kernel SVM trained on the unscaled features.
svclassifier = SVC(kernel='rbf', random_state=30, gamma='auto')
# y_train is a slice of the train_label DataFrame; flatten it to 1-D (as the
# KNN and Random Forest cells do) so the estimator is not handed a column
# vector, which scikit-learn flags with a DataConversionWarning.
svclassifier.fit(X_train, y_train.values.ravel())
y_predSVM = svclassifier.predict(X_test)
# The previous svclassifier.score(X_test, y_test) call was dropped: its result
# was discarded and it duplicated the accuracy computed here.
accuracy_score(y_test, y_predSVM)

In [None]:
# Confusion matrix of the SVM trained on unscaled features.
print('confusion matrix:')
svm_confusion = confusion_matrix(y_test, y_predSVM)
print(svm_confusion)

In [None]:
# Classification report for the SVM trained on unscaled features.
print('Classification report:')
svm_report = classification_report(y_test, y_predSVM)
print(svm_report)

### b. After data Normalization

In [None]:
# RBF-kernel SVM trained on the min-max normalized features.
# Labels are flattened to 1-D (as in the KNN and Random Forest cells) so the
# estimator is not handed a column vector, which scikit-learn flags with a
# DataConversionWarning.
svclassifier = SVC(kernel='rbf', random_state=30, gamma='auto').fit(
    normalized_train_X, y_train.values.ravel()
)
y_predSVM1 = svclassifier.predict(normalized_test_X)
accuracy_score(y_test, y_predSVM1)

### c. After data Standardisation 

In [None]:
# RBF-kernel SVM trained on the standardized features.
# Labels are flattened to 1-D (as in the KNN and Random Forest cells) so the
# estimator is not handed a column vector, which scikit-learn flags with a
# DataConversionWarning.
svclassifier = SVC(kernel='rbf', random_state=30, gamma='auto').fit(
    scaled_train_X, y_train.values.ravel()
)
y_predSVM2 = svclassifier.predict(scaled_test_X)
accuracy_score(y_test, y_predSVM2)

## 4.3- Logistic regression
### a. Before data Normalization and Standardisation

Logistic regression is a widely used method in machine learning to solve binary classification problems, where one of two classes is predicted. It is called logistic regression because it is based on the logistic (sigmoid) function. The main difference from linear regression is that here the output is 0 or 1, while in linear regression it is a continuous value.

In [None]:
# Logistic regression (SAG solver) trained on the unscaled features.
# - random_state pins the solver's data shuffling so the reported accuracy is
#   reproducible across runs.
# - Labels are flattened to 1-D (as in the KNN and Random Forest cells) to
#   avoid passing a column vector.
LR = LogisticRegression(solver='sag', random_state=42).fit(X_train, y_train.values.ravel())
y_predLR = LR.predict(X_test)
accuracy_score(y_test, y_predLR)

In [None]:
# Confusion matrix and classification report for the logistic regression
# model trained on unscaled features.
print('confusion matrix:')
lr_confusion = confusion_matrix(y_test, y_predLR)
print(lr_confusion)

print('Classification report:')
lr_report = classification_report(y_test, y_predLR)
print(lr_report)

### b. After data Normalization

In [None]:
# Logistic regression (SAG solver) trained on the min-max normalized features.
# - random_state pins the solver's data shuffling so the reported accuracy is
#   reproducible across runs.
# - Labels are flattened to 1-D (as in the KNN and Random Forest cells) to
#   avoid passing a column vector.
LR1 = LogisticRegression(solver='sag', random_state=42).fit(
    normalized_train_X, y_train.values.ravel()
)
y_predLR1 = LR1.predict(normalized_test_X)
accuracy_score(y_test, y_predLR1)

### c. After data Standardisation

In [None]:
# Logistic regression (SAG solver) trained on the standardized features.
# - random_state pins the solver's data shuffling so the reported accuracy is
#   reproducible across runs.
# - Labels are flattened to 1-D (as in the KNN and Random Forest cells) to
#   avoid passing a column vector.
LR2 = LogisticRegression(solver='sag', random_state=42).fit(
    scaled_train_X, y_train.values.ravel()
)
y_predLR2 = LR2.predict(scaled_test_X)
accuracy_score(y_test, y_predLR2)

## 4.4- K Nearest Neighbors
### a. Before data Normalization and Standardisation

It is a supervised machine learning algorithm that is used here for classification (it can also be applied to regression). This algorithm stores all available cases and classifies new cases based on a similarity measure (e.g., distance functions). The distance could be Euclidean, Manhattan or Minkowski. It is “non-parametric” (only k needs to be set) and is based only on training data.



In [None]:
# 7-nearest-neighbour classifier trained on the unscaled features;
# the label frame is flattened to a 1-D array before fitting.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train.values.ravel())
pred_KNN = knn.predict(X_test)
print('KNN', accuracy_score(y_test, pred_KNN))

In [None]:
# Confusion matrix and classification report for the KNN model trained on
# unscaled features.
print('confusion matrix:')
knn_confusion = confusion_matrix(y_test, pred_KNN)
print(knn_confusion)

print('Classification report:')
knn_report = classification_report(y_test, pred_KNN)
print(knn_report)

### b. After data Normalization

In [None]:
# 7-nearest-neighbour classifier trained on the min-max normalized features.
knn1 = KNeighborsClassifier(n_neighbors=7).fit(normalized_train_X, y_train.values.ravel())
# Fix: predict with knn1 (fit on normalized data). The previous code called
# knn, the model fit on unscaled features, so the reported score did not
# measure the effect of normalization.
pred_KNN1 = knn1.predict(normalized_test_X)
print('KNN',accuracy_score(y_test, pred_KNN1))

### c. After data Standardisation

In [None]:
# 7-nearest-neighbour classifier trained on the standardized features.
knn2 = KNeighborsClassifier(n_neighbors=7).fit(scaled_train_X, y_train.values.ravel())
# Fix: predict with knn2 (fit on standardized data). The previous code called
# knn, the model fit on unscaled features, so the reported score did not
# measure the effect of standardization.
pred_KNN2 = knn2.predict(scaled_test_X)
print('KNN',accuracy_score(y_test, pred_KNN2))

## 4.5- Random Forest 
### a. Before data Normalization and Standardisation

Random Forest is a supervised learning algorithm. It is one of the most adopted algorithms that could be used to solve regression or classification tasks. It builds a 'forest' which is an ensemble of decision trees usually trained with the 'bagging' method. 

Random Forest builds multiple decision trees and merges them together to get a more accurate and stable prediction. In addition, instead of searching for the most important feature while splitting a node, it searches for the best feature among a random subset of features. 

In [None]:
# Random forest (75 Gini trees, fixed seed) trained on the unscaled features.
RF = RandomForestClassifier(criterion='gini', n_estimators=75, random_state=42)
RF.fit(X_train, y_train.values.ravel())
pred_RF = RF.predict(X_test)
# Arguments are passed as (y_true, y_pred) to match the accuracy_score
# signature and the other cells; the accuracy value itself is unaffected.
print('RF', accuracy_score(y_test, pred_RF))

### b. After data Normalization

In [None]:
# Random forest (75 Gini trees, fixed seed) trained on the min-max normalized features.
RF1 = RandomForestClassifier(criterion='gini', n_estimators=75, random_state=42).fit(
    normalized_train_X, y_train.values.ravel()
)
pred_RF1 = RF1.predict(normalized_test_X)
# Arguments are passed as (y_true, y_pred) to match the accuracy_score
# signature and the other cells; the accuracy value itself is unaffected.
print('RF', accuracy_score(y_test, pred_RF1))

### c. After data Standardisation

In [None]:
# Random forest (75 Gini trees, fixed seed) trained on the standardized features.
RF2 = RandomForestClassifier(criterion='gini', n_estimators=75, random_state=42).fit(
    scaled_train_X, y_train.values.ravel()
)
pred_RF2 = RF2.predict(scaled_test_X)
# Arguments are passed as (y_true, y_pred) to match the accuracy_score
# signature and the other cells; the accuracy value itself is unaffected.
print('RF', accuracy_score(y_test, pred_RF2))

We noticed that, in the runs above, data normalization or standardization did not improve the results, so it can be skipped for these models. 

The best-performing algorithms were KNN and SVM, with accuracies of 91% and 90.5%, respectively. 

In [None]:
# Predict the competition test set with the KNN model fit on unscaled features
# and write the result as a two-column CSV (Id, Solution) with 1-based ids.
prediction = knn.predict(test)

n_rows = len(prediction)
mysubmission = pd.DataFrame(
    {'Id': np.arange(1, n_rows + 1), 'Solution': prediction},
    index=pd.RangeIndex(1, n_rows + 1),
)

# The frame's own index is not written; the Id column carries the row ids.
mysubmission.to_csv('Submission.csv', index=False)
