In [44]:
'''
The goal of this challenge is to build a model that predicts conversion rate and, based on the
model, come up with ideas to improve revenue.
'''

'\nThe goal of this challenge is to build a model that predicts conversion rate and, based on the\nmodel, come up with ideas to improve revenue.\n'

In [45]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from matplotlib import pyplot as plt
import scipy.stats as stats
import cPickle
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [46]:
def save_classifier(filename, clf):
    """Serialize a fitted classifier to disk with pickle.

    Parameters
    ----------
    filename : str
        Path of the file to create (overwritten if it already exists).
    clf : object
        Any picklable object, typically a fitted estimator.
    """
    # Pickle streams are byte streams: binary mode avoids newline translation
    # on Windows and is mandatory on Python 3 and for pickle protocols >= 1.
    with open(filename, 'wb') as fid:
        cPickle.dump(clf, fid)


def load_classifier(filename):
    """Load and return an object previously written by ``save_classifier``.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled object.
    """
    # WARNING: unpickling can execute arbitrary code; only load files that
    # were produced by this notebook or another trusted source.
    # Binary mode mirrors the mode used when the file is written.
    with open(filename, 'rb') as fid:
        return cPickle.load(fid)

In [47]:
#Reading the data

DATA_FILE = "data/conversion_data.csv"

df = pd.read_csv(DATA_FILE)
print "Printing few lines of the data"
print df.head()
print 
print "the dimensions of the data"
print df.shape
print
print "Summarizing the data"
print df.describe()

print
print "Observations:"
print "1. No missing values found"
print "2. Seeing that new_user and converted are categorical"

#print country.classes_
print df['country'].value_counts()
#print new_user.classes_
print df['new_user'].value_counts()
#print source.classes_
print df['source'].value_counts()
#print converted.classes_
print df['converted'].value_counts()

Printing few lines of the data
  country  age  new_user source  total_pages_visited  converted
0      UK   25         1    Ads                    1          0
1      US   23         1    Seo                    5          0
2      US   28         1    Seo                    4          0
3   China   39         1    Seo                    5          0
4      US   30         1    Seo                    6          0

the dimensions of the data
(316200, 6)

Summarizing the data
                 age       new_user  total_pages_visited      converted
count  316200.000000  316200.000000        316200.000000  316200.000000
mean       30.569858       0.685465             4.872966       0.032258
std         8.271802       0.464331             3.341104       0.176685
min        17.000000       0.000000             1.000000       0.000000
25%        24.000000       0.000000             2.000000       0.000000
50%        30.000000       1.000000             4.000000       0.000000
75%        36.00000

In [48]:
#uniform distributed
plt.plot(df['age'])
plt.savefig('Distribution of age')
plt.close()

plt.plot(df['total_pages_visited'])
plt.savefig('Distribution of total_pages_visited')
plt.close()

#outliers
#print sorted(df['age'], reverse =True)
df = df[df['age'] < 100]
print "the dimensions of the data"
print df.shape
print
print "Summarizing the data"
print df.describe()


#check country wise distribution given the conversion rate successful
df2 = df[df['converted']==1]
country  = df2.groupby('country')['country'].count()
print country

'''
#scatter plot between converted and total_pages_visited
plt.plot(df['total_pages_visited'], df['converted'])
plt.savefig('Scatter plot between total pages and conversion rate')
plt.close()
'''

the dimensions of the data
(316198, 6)

Summarizing the data
                 age       new_user  total_pages_visited      converted
count  316198.000000  316198.000000        316198.000000  316198.000000
mean       30.569311       0.685469             4.872918       0.032252
std         8.268958       0.464329             3.341053       0.176669
min        17.000000       0.000000             1.000000       0.000000
25%        24.000000       0.000000             2.000000       0.000000
50%        30.000000       1.000000             4.000000       0.000000
75%        36.000000       1.000000             7.000000       0.000000
max        79.000000       1.000000            29.000000       1.000000
country
China       102
Germany     815
UK         2549
US         6732
Name: country, dtype: int64


"\n#scatter plot between converted and total_pages_visited\nplt.plot(df['total_pages_visited'], df['converted'])\nplt.savefig('Scatter plot between total pages and conversion rate')\nplt.close()\n"

In [49]:
print 
print "Converting into categorical"
country = preprocessing.LabelEncoder()
df['country']=country.fit(df['country']).transform(df['country'])
print country.classes_
new_user = preprocessing.LabelEncoder()
df['new_user']=new_user.fit(df['new_user']).transform(df['new_user'])
print new_user.classes_
source = preprocessing.LabelEncoder()
df['source']=source.fit(df['source']).transform(df['source'])
converted = preprocessing.LabelEncoder()
df['converted']=converted.fit(df['converted']).transform(df['converted'])


print df.columns
from sklearn.utils import shuffle
df = shuffle(df)
train, test = train_test_split(df, test_size=0.3)
X_train, y_train = train.ix[:,:5], train['converted']
X_test, y_test = test.ix[:,:5], test['converted']

lreg_sci = LogisticRegression(penalty='l2')
lreg_sci.fit(X_train, y_train)
save_classifier('Classifier_logitReg.pkl', lreg_sci)
pred = lreg_sci.predict(X_test)
print classification_report(y_test, pred)


Converting into categorical
['China' 'Germany' 'UK' 'US']
[0 1]
Index([u'country', u'age', u'new_user', u'source', u'total_pages_visited',
       u'converted'],
      dtype='object')
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     91804
          1       0.86      0.66      0.75      3056

avg / total       0.98      0.99      0.98     94860

