# Grouping by the md5 ids and modelling for each cluster

In [74]:
import pandas as pd
import numpy as np
import csv

In [75]:
# Locations of the input CSVs, relative to the notebook's working directory.
x_train_file = "X_train.csv"
x_test_file = "X_test.csv"
y_train_file = "y_train.csv"

# Feature tables are loaded unchanged.
X_train, X_test = pd.read_csv(x_train_file), pd.read_csv(x_test_file)

# From the label file only the "Converted" column is kept (as a Series).
y_train = pd.read_csv(y_train_file)["Converted"]

In [76]:
# Flag every training row that has at least one missing value, collect the
# index labels of those rows, and display how many there are.
bools = X_train.isnull().any(axis=1)
indices_of_nulls = X_train.index[bools.values].tolist()
len(indices_of_nulls)

48

In [77]:
# Drop the two date columns in a single call. `axis` is passed by keyword:
# the positional form `drop(label, 1)` is deprecated and is rejected by
# pandas >= 2.0, while `axis=1` works on every pandas version.
X_train = X_train.drop(["ReceivedDateTime", "TodayDate"], axis=1)
X_train.shape

(2911318, 28)

In [78]:
# Remove the feature rows that contain missing values, then keep exactly the
# labels whose index survived. Aligning on X_train's own index (instead of
# dropping a list of positions computed earlier) keeps features and labels in
# sync even if the set of null rows changed since that list was built, and
# makes this cell safe to run more than once.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]


In [80]:
# Row counts of features and labels should match after the null rows are removed.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(X_train.shape)
print(y_train.shape)

(2911270, 28)
(2911270,)


In [81]:
# All the md5 keys, one entry per training row
md5_ids = X_train["CustomerMD5Key"]

# Number of unique md5_ids
# (parenthesised print: same output on Python 2, valid on Python 3)
print(len(np.unique(md5_ids)))

206512


In [82]:
# Inspect every row that belongs to one customer key. np.unique returns the
# distinct keys in sorted order; position 10000 looks like an arbitrary pick.
# Note that md5_ids is rebound here from the per-row Series to the unique array.
md5_ids = np.unique(md5_ids)
single = X_train.loc[X_train["CustomerMD5Key"].eq(md5_ids[10000])]
single

Unnamed: 0.1,Unnamed: 0,CustomerMD5Key,SCID,SelectedPackage,FirstDriverMaritalStatus,CarAnnualMileage,CarFuelId,CarUsageId,FirstDriverAge,CarInsuredValue,...,CarTransmissionId,SocioDemographicId,PolicyHolderResidencyArea,AllDriversNbConvictions,RatedDriverNumber,IsPolicyholderAHomeowner,CarMakeId,DaysSinceCarPurchase,NameOfPolicyProduct,AffinityCodeId
368965,368965,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NV,39.0
477571,477571,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NW,31.0
931846,931846,0x0c642d72546b3d1439685626683af090,A08213,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NP,0.0
1078576,1078576,0x0c642d72546b3d1439685626683af090,A08213,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NW,0.0
1093445,1093445,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NV,31.0
1221040,1221040,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NP,39.0
1371224,1371224,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NC,39.0
1431524,1431524,0x0c642d72546b3d1439685626683af090,A09961,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,ND,0.0
1464571,1464571,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NC,31.0
1810270,1810270,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,...,1.0,130.0,23.0,0.0,1.0,0.0,12.0,700.0,NW,39.0


In [83]:
# First 11 columns of the sample customer's rows (positions 0 through 10).
single.iloc[:, :11]

Unnamed: 0.1,Unnamed: 0,CustomerMD5Key,SCID,SelectedPackage,FirstDriverMaritalStatus,CarAnnualMileage,CarFuelId,CarUsageId,FirstDriverAge,CarInsuredValue,CarAge
368965,368965,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
477571,477571,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
931846,931846,0x0c642d72546b3d1439685626683af090,A08213,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1078576,1078576,0x0c642d72546b3d1439685626683af090,A08213,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1093445,1093445,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1221040,1221040,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1371224,1371224,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1431524,1431524,0x0c642d72546b3d1439685626683af090,A09961,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1464571,1464571,0x0c642d72546b3d1439685626683af090,A03440,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0
1810270,1810270,0x0c642d72546b3d1439685626683af090,A04402,3.0,1.0,6001.0,1.0,0.0,50.0,0.0,12.0


In [84]:
# Columns at positions 10 through 20 (both ends included) of the sample rows.
single.iloc[:, 10:21]

Unnamed: 0,CarAge,FirstDriverDrivingLicenseNumberY,VoluntaryExcess,CarParkingTypeId,PolicyHolderNoClaimDiscountYears,FirstDriverDrivingLicenceType,CoverIsNoClaimDiscountSelected,CarDrivingEntitlement,CarTransmissionId,SocioDemographicId,PolicyHolderResidencyArea
368965,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
477571,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
931846,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1078576,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1093445,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1221040,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1371224,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1431524,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1464571,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0
1810270,12.0,30.0,250.0,2.0,20.0,1.0,0.0,1.0,1.0,130.0,23.0


In [85]:
# Remaining columns of the sample rows, from position 20 to the end.
single.iloc[:, 20:]

Unnamed: 0,PolicyHolderResidencyArea,AllDriversNbConvictions,RatedDriverNumber,IsPolicyholderAHomeowner,CarMakeId,DaysSinceCarPurchase,NameOfPolicyProduct,AffinityCodeId
368965,23.0,0.0,1.0,0.0,12.0,700.0,NV,39.0
477571,23.0,0.0,1.0,0.0,12.0,700.0,NW,31.0
931846,23.0,0.0,1.0,0.0,12.0,700.0,NP,0.0
1078576,23.0,0.0,1.0,0.0,12.0,700.0,NW,0.0
1093445,23.0,0.0,1.0,0.0,12.0,700.0,NV,31.0
1221040,23.0,0.0,1.0,0.0,12.0,700.0,NP,39.0
1371224,23.0,0.0,1.0,0.0,12.0,700.0,NC,39.0
1431524,23.0,0.0,1.0,0.0,12.0,700.0,ND,0.0
1464571,23.0,0.0,1.0,0.0,12.0,700.0,NC,31.0
1810270,23.0,0.0,1.0,0.0,12.0,700.0,NW,39.0


## So within a customer (at least for the one sampled above), the only variables that differ between rows are SCID, NameOfPolicyProduct and AffinityCodeId

So let's just take these three variables and run the classifiers

In [86]:
# Restrict the training features to the three columns used for modelling.
X_train = X_train[["SCID", "NameOfPolicyProduct", "AffinityCodeId"]]

In [87]:
# Preview the first five rows of the reduced feature table.
X_train.head(n=5)

Unnamed: 0,SCID,NameOfPolicyProduct,AffinityCodeId
0,A10161,NC,0.0
1,A04779,NC,63.0
2,A04402,NP,39.0
3,A10099,NX,0.0
4,A03440,NP,31.0


In [88]:
# Sorted arrays of the distinct values present in each of the three training
# columns. The positions in `scids` and `policy` are later used as integer
# codes for the corresponding string values.
scids = np.unique(X_train["SCID"])
policy = np.unique(X_train["NameOfPolicyProduct"])
affinity = np.unique(X_train["AffinityCodeId"])

In [90]:
# Number of distinct values per column, and the largest affinity code.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(len(scids))
print(len(policy))
print(len(affinity))
print(max(affinity))

904
17
67
99.0


In [91]:
# Shapes after the column selection; the row counts should still match.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(X_train.shape)
print(y_train.shape)

(2911270, 3)
(2911270,)


In [92]:
# Integer-encode the two string columns: every value is replaced by its
# position in the sorted training vocabulary (`policy` / `scids`).
# Index.get_indexer resolves the whole column in one lookup instead of
# scanning the vocabulary with a numpy comparison once per row.
updated_policies = pd.Index(policy).get_indexer(X_train["NameOfPolicyProduct"].values)
updated_scids = pd.Index(scids).get_indexer(X_train["SCID"].values)

# get_indexer marks values missing from the vocabulary with -1. That should
# never happen for training data (for instance it would if this cell were run
# twice), so fail loudly rather than store -1 codes.
if (updated_policies < 0).any() or (updated_scids < 0).any():
    raise ValueError("X_train contains SCID / NameOfPolicyProduct values that are not in the vocabularies")

X_train["SCID"] = updated_scids
X_train["NameOfPolicyProduct"] = updated_policies

In [93]:
# Shapes after the integer encoding; encoding must not change the row counts.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(X_train.shape)
print(y_train.shape)

(2911270, 3)
(2911270,)


In [99]:
# Now run some very standard classifiers
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older releases, where the class lived in sklearn.grid_search
    # (that module was removed in scikit-learn 0.20).
    from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Search space: both penalty types crossed with five values of C (10 candidates).
param_grid = [{'clf__penalty': ['l1', 'l2'],
              'clf__C': [0.1, 0.5, 1.0, 10.0, 100.0]}]

# The solver is named explicitly: liblinear supports both 'l1' and 'l2',
# whereas the default solver of newer scikit-learn releases (lbfgs) rejects
# 'l1'. On older releases liblinear was already the default.
pipe_lr = Pipeline([("clf", LogisticRegression(solver="liblinear"))])

# 5-fold cross-validated grid search scored on accuracy, using all cores.
# NOTE(review): accuracy can be dominated by the majority class if the target
# is imbalanced -- confirm the class balance before trusting the score.
gs_pipe_lr = GridSearchCV(pipe_lr, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)

In [95]:
# From here on the features and labels are plain numpy arrays.
X_train, y_train = np.array(X_train), np.array(y_train)

In [103]:
# Shapes of the feature and label arrays that are about to be split.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(X_train.shape)
print(y_train.shape)

(2911270, 3)
(2911270,)


In [106]:
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases; sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# X_train / y_train are rebound to the remaining 80%, so running this cell
# again would split the already-reduced training part once more.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [107]:
# Run the cross-validated grid search on the training part of the split.
gs_pipe_lr.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'clf__penalty': ['l1', 'l2'], 'clf__C': [0.1, 0.5, 1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1)

In [108]:
# Best hyper-parameter combination and its mean cross-validated score.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(gs_pipe_lr.best_params_)
print(gs_pipe_lr.best_score_)

{'clf__penalty': 'l1', 'clf__C': 0.1}
0.992172119973


In [109]:
# Pipeline with the best parameters found by the search; the grid search was
# created with its default refit=True, so this estimator has been refit on the
# data passed to gs_pipe_lr.fit.
clf = gs_pipe_lr.best_estimator_

In [111]:
# Score of the selected model on the held-out validation rows.
# Parenthesised print gives the same output on Python 2 and also runs on Python 3.
print(clf.score(X_valid, y_valid))

0.992232784605


In [114]:
# Preprocess X_test the same way as the training features: keep the three
# model columns in the training order, then replace the two string columns by
# their position in the training vocabularies (`policy` / `scids`).
X_test = X_test.loc[:, ["SCID", "NameOfPolicyProduct", "AffinityCodeId"]]

# Whole-column lookups; get_indexer returns -1 for values absent from the
# training vocabulary (including missing values).
updated_policies = pd.Index(policy).get_indexer(X_test["NameOfPolicyProduct"].values)
if (updated_policies < 0).any():
    # An unseen policy product has always been fatal here; report it clearly
    # instead of failing with an opaque IndexError.
    raise ValueError("X_test contains NameOfPolicyProduct values that were not seen in training")

updated_scids = pd.Index(scids).get_indexer(X_test["SCID"].values)
# SCIDs never seen in training all share one out-of-vocabulary code, kept at
# len(scids) + 1 as before. Only genuine lookup misses are mapped, so other
# errors (or a keyboard interrupt) are no longer swallowed by a bare except.
updated_scids = np.where(updated_scids < 0, len(scids) + 1, updated_scids)

X_test["SCID"] = updated_scids
X_test["NameOfPolicyProduct"] = updated_policies

In [115]:
# Predicted class probabilities for every test row (one column per class).
# X_test is passed with its columns in the same order as the training features.
pred = clf.predict_proba(X_test)

In [117]:
# Keep only column 1 of the probability matrix, as a Series.
# NOTE(review): presumably the probability of the positive ("Converted")
# class -- confirm against clf.classes_.
pred = pd.DataFrame(pred)[1]

In [120]:
# Write the selected probabilities to disk as plain text, one value per line.
pred = np.array(pred)
np.savetxt(fname="Y_test.predict", X=pred)