In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print them one per line in directory-walk order.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Facebook ads dataset. The encoding is set explicitly to ISO-8859-1
# (presumably the file is not valid UTF-8 -- confirm against the raw CSV).
ADS_CSV_PATH = '../input/facebook-ads/Facebook_Ads_2.csv'
df = pd.read_csv(ADS_CSV_PATH, encoding='ISO-8859-1')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

## EDA

In [None]:
# Split the rows by the value of the 'Clicked' column (1 vs 0).
clicked = df.loc[df['Clicked'].eq(1)]
no_clicked = df.loc[df['Clicked'].eq(0)]

In [None]:
# Report the overall row count and the size of each 'Clicked' subset.
for label, frame in (
    ('Total=', df),
    ('Number of customers clicked = ', clicked),
    ('Number of customers not clicked = ', no_clicked),
):
    print(label, len(frame))

In [None]:
# Salary against time spent on site, coloured by the 'Clicked' label.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Time Spent on Site', y='Salary', hue='Clicked', ax=ax)
plt.show()

In [None]:
# Distribution of Salary for each value of 'Clicked'.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='Clicked', y='Salary', ax=ax)
plt.show()

In [None]:
# Distribution of time spent on site for each value of 'Clicked'.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='Clicked', y='Time Spent on Site', ax=ax)
plt.show()

In [None]:
# Histogram of Salary (40 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='Salary', kde=True, bins=40, ax=ax)
plt.show()

#### The histogram shows that most people earn a salary between 40000 and 80000, while very few earn less than 20000.

In [None]:
# Histogram of time spent on site (20 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='Time Spent on Site', kde=True, bins=20, ax=ax)
plt.show()

#### On average, people spend between 30 and 40 minutes on the site. The highest time spent is around 60 minutes and the lowest is around 5 minutes.

## Preparing the data for training/Data Cleaning

#### We are dropping the variables Names, emails and Country (Country could be used as a feature if required).

In [None]:
# Drop the Names, emails and Country columns before modelling.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a plain drop would raise KeyError on the second execution.
df = df.drop(columns=['Names', 'emails', 'Country'], errors='ignore')

# Display the remaining columns. A bare expression is only rendered when it is
# the last statement of the cell, so it lives here rather than above the drop.
df.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler



In [None]:
# Feature matrix (every column except the label) and the label vector.
# Note: X stays unscaled here; only the train/test partitions are standardized.
X = df.drop(columns=['Clicked']).values
y = df['Clicked'].values

# Split BEFORE scaling so that the held-out rows cannot influence the scaler's
# mean/variance. Fitting the scaler on the full dataset leaks test-set
# statistics into training and makes the test metrics optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the training partition.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
lr

## Model Testing

In [None]:
# Predicted class for every training row.
y_pred_train = lr.predict(X_train)

# Show only a short preview instead of dumping the entire prediction array.
y_pred_train[:10]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix on the training data. scikit-learn puts the true labels on
# the rows and the predicted labels on the columns, so the axes are labelled
# accordingly -- an unlabelled heatmap cannot be read unambiguously.
cm = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix (training set)')
plt.show()  # render the figure without echoing the Axes repr

In [None]:
# Predicted class for every held-out row.
y_pred_test = lr.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix on the test data. scikit-learn puts the true labels on the
# rows and the predicted labels on the columns, so the axes are labelled
# accordingly -- an unlabelled heatmap cannot be read unambiguously.
cm = confusion_matrix(y_test, y_pred_test)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix (test set)')
plt.show()  # render the figure without echoing the Axes repr

In [None]:
# Text summary of the per-class metrics for the test predictions.
test_report = classification_report(y_test, y_pred_test)
print(test_report)

## Visualising the training dataset

In [None]:
from matplotlib.colors import ListedColormap

# Draw the classifier's decision regions and overlay the training points.
X_set, y_set = X_train, y_train

# Build the two-colour map once and reuse it for both regions and points.
region_cmap = ListedColormap(('magenta', 'blue'))

# Dense grid spanning both feature ranges with a one-unit margin on each side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# Predict a class for every grid node and shade the resulting regions.
grid_pred = lr.predict(np.column_stack([X1.ravel(), X2.ravel()])).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=region_cmap)
plt.xlim(X1.min(), X1.max())
# The lower y-limit must come from the y-grid (X2); using X1.min() here would
# clip or pad the plot whenever the two features have different ranges.
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # `color=` (not `c=`) for a single RGBA tuple: with `c=` matplotlib warns
    # and may interpret the tuple as values to colormap when lengths coincide.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=region_cmap(i), label=j)

plt.title('facebook ad: customer click prediction (training set)')
plt.xlabel('Time spent in Site')
plt.ylabel('salary')
plt.legend()
plt.show()

## Visualizing the test dataset

In [None]:
from matplotlib.colors import ListedColormap

# Draw the classifier's decision regions and overlay the test points.
X_set, y_set = X_test, y_test

# Build the two-colour map once and reuse it for both regions and points.
region_cmap = ListedColormap(('magenta', 'blue'))

# Dense grid spanning both feature ranges with a one-unit margin on each side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# Predict a class for every grid node and shade the resulting regions.
grid_pred = lr.predict(np.column_stack([X1.ravel(), X2.ravel()])).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=region_cmap)
plt.xlim(X1.min(), X1.max())
# The lower y-limit must come from the y-grid (X2); using X1.min() here would
# clip or pad the plot whenever the two features have different ranges.
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # `color=` (not `c=`) for a single RGBA tuple: with `c=` matplotlib warns
    # and may interpret the tuple as values to colormap when lengths coincide.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=region_cmap(i), label=j)

plt.title('facebook ad: customer click prediction (testing set)')
plt.xlabel('Time spent in Site')
plt.ylabel('salary')
plt.legend()
plt.show()