In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## A. Exploratory Data Analysis

In [None]:
# Load the Social Network Ads CSV from the Kaggle input directory and display it.
csv_path = "../input/social-network-ads/Social_Network_Ads.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Pairwise Pearson correlation between the columns ("pearson" is the pandas default).
# Author's reading of the output: Age and Purchased are strongly positively correlated.
df.corr(method="pearson")

In [None]:
# Visualise the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), cmap="jet", annot=True, ax=ax)

In [None]:
# Number of rows per Age value, split by the Purchased flag.
# Author's reading of the chart: the ads appear most effective between ages 26 and 40,
# so these age groups look like the most suitable target group for commercial ads.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="Age", hue="Purchased", ax=ax)

In [None]:
# Missing-value count per column (isna is the same check as isnull); the author reports none.
df.isna().sum()

In [None]:
# Print each column's dtype and non-null count.
# Author's reading of the output: every column is numeric, so no encoding step is needed.
df.info()

## B. Preparing Data For Algorithms

1. Splitting Data into Train and Test Sets

In [None]:
# Feature matrix: every column except the "Purchased" target, as a NumPy array.
X = df.drop(columns="Purchased").to_numpy()
X.shape

In [None]:
# Target vector: the "Purchased" column as a NumPy array.
y = df.loc[:, "Purchased"].to_numpy()
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

2.Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean and unit variance so that features measured
# on very different scales contribute comparably to the models.
# Note: standardised values are NOT bounded to [-1, 1]; they are z-scores.
# The scaler is fitted on the training split only and then reused for the test split,
# so the test data is scaled with the training mean and standard deviation.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
X_train[0]

## C. Training Classification Algorithms

## 1. Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the scaled training split (fit returns the
# estimator itself), then predict labels for the scaled test split.
logistic = LogisticRegression().fit(X_train, y_train)
predictions_logistic = logistic.predict(X_test)

Evaluation of the Performance of Logistic Regression

In [None]:
# Side-by-side view of the true test labels and the logistic regression predictions
# (first five rows), to compare the model's output with the actual values.
comparison = pd.DataFrame({
    "Original Values": y_test,
    "Predictions of Logistic regression": predictions_logistic,
})
comparison.head()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the logistic regression predictions on the test split.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, predictions_logistic))

In [None]:
from matplotlib.colors import ListedColormap

# Decision regions of the logistic regression model with the training points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_train), y_train
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, logistic.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('Logistic Regression (Training set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
plt.legend()
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

# Decision regions of the logistic regression model with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, logistic.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('Logistic Regression (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
plt.legend()
plt.show()

## 2. K Nearest Neighbors:

How Algorithm Works:

* K nearest neighbors is a simple algorithm that stores all available cases and classifies new cases based on a similarity measure (e.g., distance functions).

First, we store all of the data. Second, we calculate the distance from x to every point in our data set, where x is the new data point to classify. Then we sort the points by increasing distance from x. Finally, we predict the majority label among the K closest points, where K is the number of neighbors we chose.

* Choosing K affects which class a new point is assigned to: if we choose k=3, the algorithm looks at the three nearest neighbors of the new point; if we set k=6, it looks at the six nearest neighbors and decides according to the majority of those six. With larger k values we get a cleaner cutoff, at the expense of mislabelling some points.

In [None]:
# Display the illustrative KNN diagram read from the notebook's input directory.
plt.figure(figsize=(12, 10))
knn_diagram = plt.imread("../input/knneigbor/knn.PNG")
plt.imshow(knn_diagram)

In this case, we have data points of Class A and Class B, and we want to predict the class of the star (the test data point). If we use a k value of 3 (the 3 nearest data points), we obtain a prediction of Class B. Yet if we use a k value of 6, we obtain a prediction of Class A. Therefore, the value of k is very important for our model's success.

In [None]:
from sklearn.neighbors import KNeighborsClassifier


Choosing the right k value is very important. Instead of trying different k values by hand, which is time consuming, we can loop over a range of candidate values and choose the best k.

In [None]:
# Try every k from 1 to 39 and record the share of test points each model gets wrong,
# so the error rates can be plotted and the best-performing k picked.
# NOTE(review): the error is measured on the test split, so the k chosen from this sweep
# is tuned on the same data later used to report performance; a validation split or
# cross-validation would avoid that optimistic bias.
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    prediction_i = knn.predict(X_test)
    error_rate.append((prediction_i != y_test).mean())

In [None]:
# Plot the prediction error rate obtained for each candidate k value (1..39).
plt.figure(figsize=(15,10))
plt.plot(range(1,40),error_rate, color="blue", linestyle="--",marker="o",markerfacecolor="red",markersize=10)
plt.title("Error Rate vs K Value")
# xlabel must be CALLED. Assigning a string to plt.xlabel replaces the pyplot function,
# leaves the axis unlabelled and makes every later plt.xlabel(...) call fail.
# (If the old assignment was already executed in this session, restart the kernel.)
plt.xlabel("K Value")
plt.ylabel("Error Rate")
plt.show()

As the figure above shows, the error rate is lowest for k values between about 5 and 37, so we will pick a k from that range for better predictions.

In [None]:
# Final KNN model with 5 neighbours, fitted on the scaled training split
# (fit returns the estimator itself), then used to label the scaled test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

Evaluation of the Performance of K Nearest Neighbors

In [None]:
# Side-by-side view of the true test labels and the KNN predictions (first five rows).
comparison = pd.DataFrame({
    "Original Values": y_test,
    "Predictions of KNN": knn_predictions,
})
comparison.head()

In [None]:
# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the KNN predictions on the test split.
# Author's reading of the output: KNN performs better than logistic regression on this dataset.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, knn_predictions))

Visualising the Test set results

In [None]:
from matplotlib.colors import ListedColormap

# Decision regions of the KNN model with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('blue', 'purple'))

# Grid over the padded range of both features, sampled every 1 unit.
# NOTE(review): the second axis is padded by +/-1000, so the grid has at least 2000 rows,
# and KNN must search its neighbours for every grid point; use a coarser step if this is slow.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 1),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 1))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, knn.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('K-NN (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
plt.legend()
plt.show()

## 3. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the scaled training split and label the scaled test split.
# random_state is fixed (same seed as the train/test split) because tree construction
# involves randomness; without it the scores can differ from run to run.
dtree=DecisionTreeClassifier(random_state=0)
dtree.fit(X_train, y_train)
dtree_predictions= dtree.predict(X_test)

In [None]:
# Side-by-side view of the true test labels and the decision tree predictions.
# Only the first five rows are shown, matching the comparison tables of the other models.
df1=pd.DataFrame(y_test,columns=["Original Values"])
df2=pd.DataFrame(dtree_predictions,columns=[ "Predictions of Decision Tree Classifier"])
pd.concat([df1,df2],axis=1).head()

In [None]:
# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the decision tree predictions on the test split.
# Author's reading of the output: the decision tree performs better than logistic
# regression but worse than KNN on this dataset.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, dtree_predictions))

Visualising the Test set results


In [None]:
# Decision regions of the decision tree model with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, dtree.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('Decision Tree Classification (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
# legend must be called; a bare `plt.legend` only echoes the function object and draws nothing.
plt.legend()
plt.show()

## 4. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the scaled training split and label the scaled test split.
# random_state is fixed (same seed as the train/test split) because the forest is built
# from random bootstrap samples; without it the scores differ from run to run.
# NOTE: the variable name `random` is also the name of a standard-library module; it is
# kept here because later cells refer to `random` and `random_predictions`.
random=RandomForestClassifier(random_state=0)
random.fit(X_train,y_train)
random_predictions= random.predict(X_test)

In [None]:
# Side-by-side view of the true test labels and the random forest predictions (first five rows).
comparison = pd.DataFrame({
    "Original Values": y_test,
    "Predictions of Random Forest Classifier": random_predictions,
})
comparison.head()

In [None]:
# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the random forest predictions on the test split.
# Author's reading of the output: random forest ranks second, after K Nearest Neighbors.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, random_predictions))

Visualising the Test set results

In [None]:
# Decision regions of the random forest model with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, random.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('Random Forest Classification (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
# legend must be called; a bare `plt.legend` only echoes the function object and draws nothing.
plt.legend()
plt.show()

Visualising the Test set results

## 5. Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes model on the scaled training split (fit returns the
# estimator itself), then predict labels for the scaled test split.
bayes = GaussianNB().fit(X_train, y_train)
bayes_predictions = bayes.predict(X_test)

In [None]:
# Side-by-side view of the true test labels and the naive Bayes predictions (first five rows).
# The second column must come from bayes_predictions; showing random_predictions here
# would display the random forest output under a naive Bayes heading.
df1=pd.DataFrame(y_test,columns=["Original Values"])
df2=pd.DataFrame(bayes_predictions,columns=[ "Predictions of Naive Bayes Classifier"])
pd.concat([df1,df2],axis=1).head()

In [None]:
# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the naive Bayes predictions on the test split.
# Author's reading of the output: naive Bayes predicts better than logistic regression
# but worse than the rest of the algorithms.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, bayes_predictions))

Visualising the Test set results

In [None]:
# Decision regions of the naive Bayes model with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, bayes.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
# The regions above come from the `bayes` model, so the title names naive Bayes.
plt.title('Naive Bayes (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
# legend must be called; a bare `plt.legend` only echoes the function object and draws nothing.
plt.legend()
plt.show()

## 6. Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the scaled training split
# (fit returns the estimator itself), then predict labels for the scaled test split.
svclassifier = SVC(kernel="linear").fit(X_train, y_train)
svc_predictions = svclassifier.predict(X_test)

In [None]:
# Side-by-side view of the true test labels and the SVM predictions (first five rows).
comparison = pd.DataFrame({
    "Original Values": y_test,
    "Predictions of Support Vector Machines": svc_predictions,
})
comparison.head()

In [None]:
# Print, in order: the per-class report, the confusion matrix and the overall accuracy
# of the SVM predictions on the test split.
# Author's reading of the output: the SVM results are almost the same as naive Bayes.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, svc_predictions))

Visualization of Test Results

In [None]:
# Decision regions of the linear SVM with the test points drawn on top.
# The points are mapped back to their original units so the axes are readable.
X_set, y_set = ss.inverse_transform(X_test), y_test
class_colors = ListedColormap(('red', 'green'))

# Dense grid over the padded range of both features.
# NOTE(review): a 0.25 step on an axis padded by +/-1000 produces at least 8000 grid rows;
# if this cell is slow or memory-hungry, a coarser step on that axis is the first thing to try.
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# The model was trained on scaled features, so the grid is scaled before predicting.
plt.contourf(X1, X2, svclassifier.predict(ss.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` can be mistaken for
    # four numeric values to colour-map when a class happens to contain four points.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color = class_colors(i), label = j)
plt.title('SVM (Test set)')
# Labels are set on the Axes object so the cell still works even if the pyplot-level
# xlabel/ylabel names were rebound earlier in the session.
ax = plt.gca()
ax.set_xlabel('Age')
ax.set_ylabel('Estimated Salary')
plt.legend()
plt.show()