# **Social network product purchase**


### **Objective** : 
The objective of this analysis is to predict whether a person will buy a product displayed in a social network ad.

### Import Library

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve

# Use seaborn's dark grid theme for every plot drawn after this point
sns.set(style='darkgrid')

# Print the full path of every file found under the Kaggle input directory
import os
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



### Import DataSet

In [None]:
# Read the Social Network Ads CSV into a DataFrame and preview five random rows
DATA_PATH = '/kaggle/input/social-network-ads/Social_Network_Ads.csv'
df = pd.read_csv(DATA_PATH)
df.sample(5)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Print each column's dtype and non-null count
df.info()

* 'Gender': categorical value
* 'Age': integer value
* 'EstimatedSalary': integer value
* 'Purchased': target variable

In [None]:
# Remove the 'User ID' column; it is not used in the analysis
df.drop(columns=['User ID'], inplace=True)

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Side-by-side box plots of the two numeric features to look for outliers;
# the mean is drawn as a line in addition to the median
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
age_ax, salary_ax = ax
df['Age'].plot.box(ax=age_ax, showmeans=True, meanline=True)
df['EstimatedSalary'].plot.box(ax=salary_ax, showmeans=True, meanline=True)

In [None]:
# Histogram of every numeric column to inspect the distributions
df.hist(figsize = (15, 8))

In [None]:
# Number of rows for each gender category
df['Gender'].value_counts()

In [None]:
# Pie chart showing the share of each gender category
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(autopct='%1.1f%%', shadow=True, figsize=(6, 8))

In [None]:
# Number of rows per target class.
# The column is passed as x= because seaborn >= 0.12 accepts only `data`
# positionally; the positional form raises a TypeError there.
sns.countplot(x='Purchased', data=df)

In [None]:
# Pie chart of the target class shares; the second slice is pulled out slightly
purchased_counts = df['Purchased'].value_counts()
purchased_counts.plot.pie(
    autopct='%1.1f%%',
    shadow=True,
    figsize=(6, 8),
    explode=[0, 0.06],
)

In [None]:
# Bivariate analysis: age against estimated salary, coloured by purchase outcome.
# x and y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.scatterplot(x='Age', y='EstimatedSalary', hue='Purchased', data=df)

In [None]:
# Mean age per gender, split by purchase outcome.
# x and y are passed by keyword because seaborn >= 0.12 rejects them positionally.
# ci=None is the documented way to switch the error bars off (the original
# ci=False only worked by being treated as a zero-width interval).
sns.barplot(x='Gender', y='Age', hue='Purchased', data=df, ci=None)

In [None]:
# Age distribution per gender, split by purchase outcome.
# x and y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.violinplot(x='Gender', y='Age', hue='Purchased', data=df)

In [None]:
# Estimated salary distribution per gender, split by purchase outcome.
# x and y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.boxplot(x='Gender', y='EstimatedSalary', hue='Purchased', data=df)

##### The Gender feature is not useful because it has little or no effect on Purchased

In [None]:
# Remove the 'Gender' column from the DataFrame in place
df.drop(columns=['Gender'], inplace=True)

In [None]:
# Heat map of the pairwise correlations between the remaining columns,
# with each coefficient written in its cell
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Separate the target column from the feature columns
y = df['Purchased']
X = df.drop(columns=['Purchased'])

In [None]:
# Show the shapes of the feature matrix and the target to check their row counts match
X.shape, y.shape

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Standardize the features: learn the scaling from the training set only,
# then apply that same scaling to both the training and the test set
std = StandardScaler()
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

### Logistic Regression

In [None]:
# Collect each model's name and test accuracy for the comparison plots
model_list = []
score_list = []

# Fit logistic regression on the scaled training set
model1 = LogisticRegression()
model1.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred1 = model1.predict(X_test)
score1 = accuracy_score(y_test, y_pred1)
model_list.append('Logistic_Regression')
score_list.append(score1)

print("Model : Logistics Regression")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score1))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred1)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred1)))

### Random Forest

In [None]:
# Fit a random forest on the scaled training set (seeded for reproducibility)
model2 = RandomForestClassifier(random_state=5)
model2.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred2 = model2.predict(X_test)
score2 = accuracy_score(y_test, y_pred2)
model_list.append('RandomForestClassifier')
score_list.append(score2)

print("Model : RandomForest Classifier")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score2))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred2)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred2)))

In [None]:
# Fit gradient boosting on the scaled training set.
# random_state is fixed so the reported scores are reproducible between runs,
# in line with the seeded train/test split and the other seeded models.
model3 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.01, random_state=0)
model3.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred3 = model3.predict(X_test)
score3 = accuracy_score(y_test, y_pred3)
model_list.append('GradientBoostingClassifier')
score_list.append(score3)

print("Model : GradientBoosting Classifier")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score3))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred3)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred3)))

In [None]:
# Fit AdaBoost on the scaled training set.
# random_state is fixed so the reported scores are reproducible between runs,
# in line with the seeded train/test split and the other seeded models.
model4 = AdaBoostClassifier(random_state=0)
model4.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred4 = model4.predict(X_test)
score4 = accuracy_score(y_test, y_pred4)
model_list.append('AdaBoostClassifier')
score_list.append(score4)

print("Model : AdaBoost Classifier")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score4))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred4)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred4)))

In [None]:
# Fit an RBF-kernel support vector classifier on the scaled training set
model5 = SVC(kernel='rbf', random_state=2)
model5.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred5 = model5.predict(X_test)
score5 = accuracy_score(y_test, y_pred5)
model_list.append('Support vectorClassifier')
score_list.append(score5)

print("Model : Support vector Classifier")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score5))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred5)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred5)))

In [None]:
# Fit Gaussian naive Bayes on the scaled training set
model6 = GaussianNB()
model6.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for every metric
y_pred6 = model6.predict(X_test)
score6 = accuracy_score(y_test, y_pred6)
model_list.append('GaussianNB')
score_list.append(score6)

print("Model : GaussianNB")
print('=====================================')
print('Accuracy : {:0.2f}'.format(score6))
print('=====================================')
print('Recall_Score : {:0.2f}'.format(recall_score(y_test, y_pred6)))
print('=====================================')
print('precission_Score : {:0.2f}'.format(precision_score(y_test, y_pred6)))

In [None]:
# Line plot of the test accuracy recorded for each model.
# x and y are passed by keyword because seaborn >= 0.12 rejects positional
# data vectors with a TypeError.
plt.figure(figsize=(15, 5))
sns.lineplot(x=model_list, y=score_list)

In [None]:
# Bar plot of the test accuracy recorded for each model.
# x and y are passed by keyword because seaborn >= 0.12 rejects positional
# data vectors with a TypeError.
plt.figure(figsize=(18, 5))
sns.barplot(x=model_list, y=score_list)

# We can say that Gradient Boosting and Random Forest work well