In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow import keras

## Section 01:- Reading Data and Checking the Meta information

In [None]:
# Load the Social Network Ads CSV from the Kaggle input directory.
ads_csv_path = '../input/social-network-ads/Social_Network_Ads.csv'
df = pd.read_csv(ads_csv_path)

# info() prints dtypes and non-null counts; describe() is the cell's
# last expression, so the notebook renders it as a table.
df.info()
df.describe()

## Section 02:- EDA

### Checking the correlation

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")
plt.show()

### Checking each column's correlation with the target variable using a bar chart

In [None]:
# Correlation of every column with the 'Purchased' target, largest first.
plt.figure(figsize=(25,10))
target_corr = df.corr()['Purchased']
target_corr_sorted = target_corr.sort_values(ascending=False)
target_corr_sorted.plot(kind='bar')
plt.show()

### Checking the median estimated salary of buyers and non-buyers across age groups

In [None]:
# Bucket customers into 5-year age bins and take the median of the remaining
# columns for each (Purchased, age bin) pair.
age_bins = range(15,75,5)
age_group = pd.cut(df['Age'], age_bins)

# The bin index level is renamed to 'Age_group' before it is turned into a
# column, because the frame already has an 'Age' column (the median age).
medians_by_age_group = (
    df.groupby(["Purchased", age_group])
      .median()
      .rename_axis(['Purchased', 'Age_group'])
      .reset_index()
)

fig, ax = plt.subplots(figsize=(12,5))
sns.barplot(
    data=medians_by_age_group,
    x='Age_group',
    y='EstimatedSalary',
    hue="Purchased",
    palette=['#cc66ff','#0066ff'],
    alpha=0.7,
    edgecolor='k',
    ax=ax,
)
ax.set_title('Median estimated salary of customers based on Age who purchased or not')
ax.set_xlabel('Age group')
plt.show()

### Checking the target Variable using CountPlot

In [None]:
# Count of rows per target class, with the count written above each bar.
plt.figure(figsize = (12,8))
plt.grid(True)
ax = sns.countplot(x='Purchased', data=df, palette='Spectral_r')
for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x()+0.3, bar_height+0.5)
    ax.annotate('{:.0f}'.format(bar_height), label_xy)

## Section 03:- Scaling and Splitting the Dataset into X_train and X_test

In [None]:
# Features are every column except the last; the last column is the target.
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

# Split BEFORE scaling so that the held-out rows never influence the scaler
# (fitting MinMaxScaler on the full dataset leaks test-set min/max into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=2, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

## Section 04:- Model Building

### <u><b>Model 01:- Support Vector machine(SVM)</b></u>

In [None]:
# Support vector classifier with default hyper-parameters.
model = SVC().fit(X_train, y_train)

svm_train_accuracy = model.score(X_train, y_train)
print(f' Training Accuracy is:- {svm_train_accuracy}')

# Last expression of the cell: shown as the cell's output.
f'Test Accuracy is:- {model.score(X_test,y_test)}'

### <b><u>Model 02:- KNN</u></b> 

In [None]:
# Sweep the number of neighbours and record train/test accuracy for each k.
k = range(1,20)
trainingAccuracy = []
testAccuracy = []
for i in k:
    knn = KNeighborsClassifier(n_neighbors=i, n_jobs=15, p=1, weights='distance')
    knn.fit(X_train, y_train)
    # NOTE: with weights='distance' every training point is its own
    # zero-distance neighbour, so the training curve is expected to sit at ~1.0.
    trainingAccuracy.append(knn.score(X_train, y_train))
    testAccuracy.append(knn.score(X_test, y_test))

# Draw both curves on one explicit Axes; labels are attached to the lines so
# the legend cannot get out of sync with the plotting order.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10,4))
axes.plot(k, trainingAccuracy, label="Training Accuracy")
axes.plot(k, testAccuracy, label="Test Accuracy")
axes.set_xlabel("value of K")
axes.set_ylabel("Accuracy of test and training")
axes.set_title("Select best value of k")
axes.legend()
plt.show()

# Report the best test accuracy together with the k that produced it.
# NOTE(review): picking k on the test split gives an optimistic estimate;
# the cross-validated grid search later in the notebook is the sounder choice.
best_test_accuracy = max(testAccuracy)
best_k = k[testAccuracy.index(best_test_accuracy)]
print("\n Best Test accuracy is:- ", best_test_accuracy, "at k =", best_k)

#### <b><u>Using the chart above to choose the parameter values for KNN</u></b>

In [None]:
# KNN with hand-picked settings: 3 neighbours, Euclidean distance (p=2),
# uniform neighbour weights.
knn_model = KNeighborsClassifier(n_neighbors=3, p=2, weights='uniform', n_jobs=15)
knn_model.fit(X_train, y_train)

knn_train_accuracy = knn_model.score(X_train, y_train)
print(f' Training Accuracy {knn_train_accuracy}')

# Last expression of the cell: shown as the cell's output.
f' Testing Accuracy {knn_model.score(X_test,y_test)}'

#### <b><u> Visualising the Classification and Misclassification of Train and Test Prediction </u></b>

In [None]:
from matplotlib.colors import ListedColormap

# Plot in the original units: undo the scaling of the training features.
# Column 0 is treated as Age and column 1 as Estimated Salary (see axis labels).
X_set, y_set = scaler.inverse_transform(X_train), y_train
class_cmap = ListedColormap(('red', 'green'))

# Grid resolution per axis. A salary step of 1 would create millions of grid
# points for KNN to classify; a coarser step gives a visually identical
# region plot at a fraction of the cost.
age_step, salary_step = 0.25, 250
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=age_step),
    np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=salary_step),
)

# Classify every grid point: scale it like the training data, then predict.
grid_scaled = scaler.transform(np.column_stack([X1.ravel(), X2.ravel()]))
grid_pred = knn_model.predict(grid_scaled).reshape(X1.shape)

plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Positional boolean mask (y_set is a pandas Series, X_set a NumPy array).
    mask = np.asarray(y_set == j)
    # `color=` rather than `c=`: a single RGBA tuple passed as `c` is ambiguous.
    plt.scatter(X_set[mask, 0], X_set[mask, 1], color=class_cmap(i), label=j)
plt.title('K-NN (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

# Plot in the original units: undo the scaling of the test features.
# Column 0 is treated as Age and column 1 as Estimated Salary (see axis labels).
X_set, y_set = scaler.inverse_transform(X_test), y_test
class_cmap = ListedColormap(('red', 'green'))

# Grid resolution per axis. A salary step of 1 would create millions of grid
# points for KNN to classify; a coarser step gives a visually identical
# region plot at a fraction of the cost.
age_step, salary_step = 0.25, 250
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=age_step),
    np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=salary_step),
)

# Classify every grid point: scale it like the training data, then predict.
grid_scaled = scaler.transform(np.column_stack([X1.ravel(), X2.ravel()]))
grid_pred = knn_model.predict(grid_scaled).reshape(X1.shape)

plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # Positional boolean mask (y_set is a pandas Series, X_set a NumPy array).
    mask = np.asarray(y_set == j)
    # `color=` rather than `c=`: a single RGBA tuple passed as `c` is ambiguous.
    plt.scatter(X_set[mask, 0], X_set[mask, 1], color=class_cmap(i), label=j)
plt.title('K-NN (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

#### <b><u>KNN With Hyper parameter Tuning</u></b>

In [None]:
# Search space: neighbour count, neighbour weighting, and Minkowski power p.
k_range = list(range(1,50))
weight_options = ["uniform", "distance"]
pe = [1,2]
param_grid = {
    "n_neighbors": k_range,
    "weights": weight_options,
    "p": pe,
}

# Exhaustive 10-fold cross-validated search on the training split.
knn = KNeighborsClassifier()
knngrid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=15,
)
knngrid.fit(X_train,y_train)

print ("Best score on 10 folds split Data on Train split is :- ",knngrid.best_score_)
print ("\n Best Param:- ",knngrid.best_params_)
print ("\n Best KNN Metric:- ", knngrid.best_estimator_)

knngrid_train_accuracy = knngrid.score(X_train, y_train)
print(f' \n Training Accuracy {knngrid_train_accuracy}')

# Last expression of the cell: shown as the cell's output.
f'Test Accuracy {knngrid.score(X_test,y_test)}'

#### <b><u>Checking confusion matrix for the above Hyperparameter tuned model</u></b>

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn

# Predictions of the tuned KNN on the test split, compared with the true labels.
y_predicted = knngrid.predict(X_test)
cm = confusion_matrix(y_test, y_predicted)

# Rows are the true classes, columns the predicted classes.
cm_fig, cm_ax = plt.subplots(figsize = (10,7))
sn.heatmap(cm, annot=True, fmt='g', ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Truth')

### <b><u> Model 03:- Logistic Regression </u></b>

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the lbfgs solver and a raised iteration cap.
log_model = LogisticRegression(max_iter=1000, solver='lbfgs')
log_model.fit(X_train, y_train)

log_train_accuracy = log_model.score(X_train, y_train)
print(f' Training Accuracy {log_train_accuracy}')

# Last expression of the cell: shown as the cell's output.
f'Test Accuracy {log_model.score(X_test,y_test)}'

### <b><u>Model 04:- Neural Network</u></b>

In [None]:
# 5-fold stratified CV with a fixed shuffle seed so the folds are reproducible.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Search over every activation/solver combination supported by MLPClassifier.
param_grid = [
        {
            'activation' : ['identity', 'logistic', 'tanh', 'relu'],
            'solver' : ['lbfgs', 'sgd', 'adam'],
        }
       ]

# The estimator is seeded so that weight initialisation (and therefore the
# selected hyper-parameters and reported accuracies) is repeatable across runs.
# max_iter is raised above the default because the sgd/adam solvers commonly
# stop before converging at the default iteration cap.
mlp = MLPClassifier(random_state=4, max_iter=1000)
clf = GridSearchCV(mlp, param_grid, cv=folds,
                           scoring='accuracy',n_jobs=-1,verbose = 1,
)
clf.fit(X_train, y_train)
print(f' Training Accuracy {clf.score(X_train,y_train)}')

# Last expression of the cell: shown as the cell's output.
f'Test Accuracy {clf.score(X_test,y_test)}'

### <b><u>Best params used for MLPClassifier</u></b>

In [None]:
# Collect the per-candidate cross-validation results of the MLP grid search.
cv_results = pd.DataFrame(clf.cv_results_)

# Print the optimum value of the hyperparameters.
print('Best hyperparameters: ', clf.best_params_)

# Candidates ranked in the top four by mean test score. This must be the last
# expression of the cell: a bare expression in the middle of a cell is
# evaluated but never displayed.
cv_results[cv_results.rank_test_score<5]

### <b><u> Neural network with Dense layers and a stop criterion on reaching the accuracy threshold on the training data </u></b>

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training as soon as the training accuracy reaches a threshold.

  Args:
    threshold: training accuracy (0-1) at which training is cancelled.
      Defaults to 0.95.
  """
  def __init__(self, threshold=0.95):
    super().__init__()
    self.threshold = threshold

  def on_epoch_end(self, epoch, logs=None):
    # `logs` may be None, and 'accuracy' may be absent if the metric was not
    # compiled in; in either case there is nothing to compare against.
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy >= self.threshold:
      # The message is derived from the threshold so the two cannot drift apart.
      print(f"\nReached {self.threshold:.0%} accuracy so cancelling training!")
      self.model.stop_training = True
callbacks = myCallback()

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable.
tf.random.set_seed(4)

# Fully connected classifier. The output layer has one unit per class: the
# target is binary, so 2 units (labels 0/1 with sparse categorical cross-entropy).
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(800, activation=tf.nn.relu),
  tf.keras.layers.Dense(256, activation=tf.nn.relu),
  tf.keras.layers.Dense(256, activation=tf.nn.relu),
  tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])
# NOTE(review): early stopping is driven by TRAINING accuracy, so it does not
# guard against over-fitting; consider monitoring a validation split instead.
model.fit(X_train, y_train, epochs=500,batch_size=32, callbacks=[callbacks])

In [None]:
# evaluate() returns [loss, accuracy] because the model was compiled with a
# single 'accuracy' metric; report the two values separately.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Loss on test data is ", test_loss)
print("Accuracy on test data is ", test_accuracy)

# Class probabilities from the neural network itself (not the earlier KNN
# predictions); argmax over the class axis gives the predicted label.
nn_probabilities = model.predict(X_test)
print("Predicted value of ytest[4] is ", np.argmax(nn_probabilities[4]))

# y_test keeps the shuffled original row labels, so use positional access:
# y_test[4] would look up the row *labelled* 4, which may not be in the test set.
print("Actual value is:- ", y_test.iloc[4])

### <b><u>Conclusion:-</u></b>
* KNN proves to be the Best model with good accuracy on test among all the models used above.