# Model Implementation

In [34]:
# importing necessary libraries 
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [35]:
# Read the prepared training table from the data folder under the current working directory
final_csv_path = os.getcwd()+'/data/final.csv'
df = pd.read_csv(final_csv_path)

# Preview the first rows as the cell output
df.head()

Unnamed: 0,id,first_name,last_name,email_id,birthdate,street_address,zipcode,city,state,phone_number,active,region,gender,age,connection_type,average_call_length,app,MB used,personas
0,1,Eustace,McCoid,emccoid0@parallels.com,5/18/59,0308 Bunker Hill Parkway,92844,Garden Grove,California,310-554-8521,False,PacificCoastal,Male,60,new,60.7,Venmo,805.24,Matt
1,1,Eustace,McCoid,emccoid0@parallels.com,5/18/59,0308 Bunker Hill Parkway,92844,Garden Grove,California,310-554-8521,False,PacificCoastal,Male,60,new,60.7,Google,164.21,Jason
2,1,Eustace,McCoid,emccoid0@parallels.com,5/18/59,0308 Bunker Hill Parkway,92844,Garden Grove,California,310-554-8521,False,PacificCoastal,Male,60,new,60.7,Wikipedia,311.08,Andrew
3,1,Eustace,McCoid,emccoid0@parallels.com,5/18/59,0308 Bunker Hill Parkway,92844,Garden Grove,California,310-554-8521,False,PacificCoastal,Male,60,new,60.7,Twitter,77.8,Jason
4,1,Eustace,McCoid,emccoid0@parallels.com,5/18/59,0308 Bunker Hill Parkway,92844,Garden Grove,California,310-554-8521,False,PacificCoastal,Male,60,new,60.7,Instagram,648.48,David


### Label Encoding the training data

In [36]:
# Label-encode every categorical column of the training frame.
#
# For each encoded column this cell keeps:
#   * a reference frame (df_personas, df_regions, ...) pairing every original value
#     with its integer code; later cells group these frames to display the mapping;
#   * the array of integer codes (personas, region, ...);
#   * the fitted LabelEncoder, stored in `encoders` under the column name, so the
#     mapping of one column is not lost when the next column is encoded.
#
# NOTE: the cell overwrites columns of `df`, so it must be run once per fresh load
# of `df`; running it twice would encode the already-encoded integers.

def encode_column(frame, column, ref_column=None):
    """Label-encode ``frame[column]``, overwriting the column with integer codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column is replaced by its integer codes.
    column : str
        Name of the column to encode.
    ref_column : str, optional
        Name given to the original-value column of the returned reference
        frame; defaults to ``column``.

    Returns
    -------
    reference : pandas.DataFrame
        One row per row of ``frame`` holding the original value (``ref_column``)
        and the code it received (``'encoded'``).
    codes : numpy.ndarray
        The integer codes written into ``frame[column]``.
    encoder : LabelEncoder
        The encoder fitted on the original values.
    """
    original = frame[column].values
    encoder = LabelEncoder()
    codes = encoder.fit_transform(original)

    # Build the reference copy before the column is overwritten
    reference = pd.DataFrame({ref_column or column: original, 'encoded': codes})
    frame[column] = codes
    return reference, codes, encoder


# Fitted encoders, keyed by the name of the column of `df` they were fitted on
encoders = {}

df_personas, personas, encoders['personas'] = encode_column(df, 'personas')
df_regions, region, encoders['region'] = encode_column(df, 'region')
df_gender, gender, encoders['gender'] = encode_column(df, 'gender')
df_active, active, encoders['active'] = encode_column(df, 'active')
df_app, app, encoders['app'] = encode_column(df, 'app')
# The reference frame for connection_type names its value column 'connection'
df_con, con, encoders['connection_type'] = encode_column(df, 'connection_type', ref_column='connection')

# `le` used to end up bound to the last fitted encoder; keep that name available
le = encoders['connection_type']

# Keep only the characteristics used for modelling (drop identifying / address fields)
df = df.drop(columns=['id','first_name','last_name','email_id','birthdate','street_address','zipcode',
                      'city','state','phone_number'])
df.head()

Unnamed: 0,active,region,gender,age,connection_type,average_call_length,app,MB used,personas
0,0,3,1,60,0,60.7,13,805.24,6
1,0,3,1,60,0,60.7,6,164.21,4
2,0,3,1,60,0,60.7,15,311.08,0
3,0,3,1,60,0,60.7,12,77.8,4
4,0,3,1,60,0,60.7,7,648.48,1


### Split the dataset into training and testing samples 

In [37]:
# Independent variables (features) used by the classifier
X = df[['age','gender','active','region','MB used','app','connection_type','average_call_length']]

# Target variable: the encoded persona label
y = df['personas']

# Split the data 70% / 30%. A fixed random_state makes the split, and therefore the
# accuracies reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [38]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
gnb = GaussianNB().fit(X_train, y_train)

# Accuracy on the data the model was fitted on
train_accuracy = gnb.score(X_train, y_train)
print("Training Accuracy", train_accuracy)

# Predictions for the held-out split, placed next to the true labels for comparison
nb_pred = gnb.predict(X_test)
pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': nb_pred})

# Accuracy on the held-out split
accuracy = gnb.score(X_test, y_test)
print('Testing Accuracy:', accuracy)

# Show the first ten actual/predicted pairs as the cell output
pred_df.head(10)

Training Accuracy 0.9959596612579998
Testing Accuracy: 0.9962292609351433


Unnamed: 0,Actual,Predicted
12115,4,4
41266,2,2
39839,1,1
9340,1,1
34923,6,6
17145,2,2
27471,5,5
42372,6,6
26373,5,5
14377,0,0


In [39]:
# Reference table: the integer code assigned to each region
# (first row of every group of the region/code copy)
dfr = df_regions.groupby(by='region')
dfr.first()

Unnamed: 0_level_0,encoded
region,Unnamed: 1_level_1
Mid-atlantic,0
Midwest,1
New-England,2
PacificCoastal,3
RockyMountain,4
Southern,5
Southwest,6


In [40]:
# Reference table: the integer code assigned to each gender
# (first row of every group of the gender/code copy)
dfg = df_gender.groupby(by='gender')
dfg.first()

Unnamed: 0_level_0,encoded
gender,Unnamed: 1_level_1
Female,0
Male,1


In [41]:
# Reference table: the integer code assigned to each value of the 'active' flag
dfa = df_active.groupby(by='active')
dfa.first()

Unnamed: 0_level_0,encoded
active,Unnamed: 1_level_1
False,0
True,1


In [42]:
# Reference table: the integer code assigned to each app
dfap = df_app.groupby(by='app')
dfap.first()

Unnamed: 0_level_0,encoded
app,Unnamed: 1_level_1
Amazon,0
Bigcommerce,1
Bing,2
Blogspot,3
Facebook,4
Gmail,5
Google,6
Instagram,7
Netflix,8
Reddit,9


In [43]:
# Reference table: the integer code assigned to each persona (the target label)
dfp = df_personas.groupby(by='personas')
dfp.first()

Unnamed: 0_level_0,encoded
personas,Unnamed: 1_level_1
Andrew,0
David,1
Erica,2
Gina,3
Jason,4
Jessica,5
Matt,6
Tiffany,7


# Model Classification

### Label Encoding the sample customers

In [45]:
# Load the sample customers and their internet-usage records, then attach the usage
# rows to each customer. The left join keeps every customer row from the first file.
df_sample_c = pd.read_csv(os.getcwd()+"/data/cust_sample.csv")
df_sample_int = pd.read_csv(os.getcwd()+"/data/internet_sample.csv")
df_sample = pd.merge(df_sample_c, df_sample_int, how='left', on = 'id')

# Rows of the merged sample with at least one missing value, before imputation.
# The mask must come from df_sample itself; a mask built from the training frame
# `df` has a different index and does not describe these rows.
rows_missing_before = df_sample[df_sample.isnull().any(axis = 1)]

# Fill missing usage. Assigning the result back (rather than calling
# fillna(..., inplace=True) on a column selection) guarantees df_sample is updated.
# NOTE(review): the fill value is the mean of the TRAINING frame `df`, not of the
# sample itself; confirm this is intended.
df_sample['MB used'] = df_sample['MB used'].fillna(df['MB used'].mean())

# Forward-fill missing app names from the previous row
df_sample['app'] = df_sample['app'].ffill()

# Snapshot of the sample with its original (un-encoded) category values
df_sample_copy = df_sample.copy()

# Rows that still contain a missing value after imputation
rows_missing_after = df_sample[df_sample.isnull().any(axis = 1)]

df_sample.to_csv(os.getcwd()+"/data/final_sample.csv")

df_sample.head()

  if __name__ == '__main__':


Unnamed: 0,id,first_name,last_name,gender,birthdate,zipcode,city,state,active,region,connection_type,average_call_length,age,app,MB used
0,10001,Gayle,Spinelli,Male,2/10/1984,98175,Seattle,Washington,False,PacificCoastal,new,48.35,35,Instagram,784.2
1,10001,Gayle,Spinelli,Male,2/10/1984,98175,Seattle,Washington,False,PacificCoastal,new,48.35,35,Wikipedia,250.97
2,10001,Gayle,Spinelli,Male,2/10/1984,98175,Seattle,Washington,False,PacificCoastal,new,48.35,35,Amazon,328.09
3,10001,Gayle,Spinelli,Male,2/10/1984,98175,Seattle,Washington,False,PacificCoastal,new,48.35,35,Twitter,358.73
4,10001,Gayle,Spinelli,Male,2/10/1984,98175,Seattle,Washington,False,PacificCoastal,new,48.35,35,Snapchat,235.42


In [46]:
# Encode the sample's categorical columns with the SAME integer codes that were used
# for the training data.
#
# Fitting a fresh LabelEncoder on the sample assigns codes from the sample's own sorted
# set of values, which only matches the training codes when both data sets contain
# exactly the same categories. The value/code reference frames recorded when the
# training data was encoded (df_regions, df_gender, df_active, df_app, df_con) are
# used here instead, so a category always receives the code the model was trained on.

def encode_with_training_codes(values, reference, key_column):
    """Translate raw category values into the integer codes used for training.

    Parameters
    ----------
    values : array-like
        Raw category values to encode.
    reference : pandas.DataFrame
        Frame with the original training values in ``key_column`` and their
        integer codes in ``'encoded'``.
    key_column : str
        Name of the original-value column in ``reference``.

    Returns
    -------
    numpy.ndarray
        Integer codes, one per element of ``values``.

    Raises
    ------
    ValueError
        If ``values`` contains a value that has no training code (including
        missing values), since the model has no code for such a value.
    """
    lookup = dict(zip(reference[key_column], reference['encoded']))
    unseen = sorted({str(value) for value in values if value not in lookup})
    if unseen:
        raise ValueError(
            "Values without a training code in '%s': %s" % (key_column, unseen))
    return np.array([lookup[value] for value in values], dtype=int)


#**********************Labeling regions*******************
region1 = encode_with_training_codes(df_sample['region'].values, df_regions, 'region')
df_sample['region'] = region1

#**********************Labeling gender*******************
gender1 = encode_with_training_codes(df_sample['gender'].values, df_gender, 'gender')
df_sample['gender'] = gender1

#**********************Labeling active*******************
active1 = encode_with_training_codes(df_sample['active'].values, df_active, 'active')
df_sample['active'] = active1

#**********************Labeling apps*******************
app1 = encode_with_training_codes(df_sample['app'].values, df_app, 'app')
df_sample['app'] = app1

#**********************Labeling connection*******************
# The training reference frame stores connection types under the name 'connection'
con1 = encode_with_training_codes(df_sample['connection_type'].values, df_con, 'connection')
df_sample['connection_type'] = con1

# Independent variables, selected after encoding so they hold the numeric codes
X1 = df_sample[['age','gender','active','region','MB used','app','connection_type','average_call_length']]

### Predicting personas for sample data

In [47]:

# Predict a persona for every sample row with one vectorised call. The columns are
# selected by name, in the same order as the feature frame the model was fitted on.
feature_columns = ['age','gender','active','region','MB used','app','connection_type','average_call_length']

# predict() rejects an empty input, so an empty sample simply yields no predictions
pred = list(gnb.predict(df_sample[feature_columns])) if len(df_sample) else []

# Translate the predicted codes back to persona names using the value/code pairs
# recorded when the training labels were encoded. A hand-written if/elif chain would
# have to be kept in sync with the encoder and would label any unexpected code with
# its final fallback name.
code_to_persona = dict(zip(df_personas['encoded'], df_personas['personas']))
persona_list = [code_to_persona[code] for code in pred]

df_sample['personas'] = persona_list

df_sample.head()

Unnamed: 0,id,first_name,last_name,gender,birthdate,zipcode,city,state,active,region,connection_type,average_call_length,age,app,MB used,personas
0,10001,Gayle,Spinelli,1,2/10/1984,98175,Seattle,Washington,0,3,0,48.35,35,7,784.2,Matt
1,10001,Gayle,Spinelli,1,2/10/1984,98175,Seattle,Washington,0,3,0,48.35,35,15,250.97,Jason
2,10001,Gayle,Spinelli,1,2/10/1984,98175,Seattle,Washington,0,3,0,48.35,35,0,328.09,Andrew
3,10001,Gayle,Spinelli,1,2/10/1984,98175,Seattle,Washington,0,3,0,48.35,35,12,358.73,Andrew
4,10001,Gayle,Spinelli,1,2/10/1984,98175,Seattle,Washington,0,3,0,48.35,35,10,235.42,Jason


In [48]:
# Write the sample, now carrying the predicted 'personas' column, to the data folder
final_sample_path = os.getcwd()+'/data/final_sample_data.csv'
df_sample.to_csv(final_sample_path)