In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re


In [None]:
# Read both JSON files with the same orientation: each top-level key becomes a row.
train_data, test_data = (
    pd.read_json(path, orient='index') for path in ('train.json', 'test.json')
)

In [None]:
# Peek at the first 10 rows of the freshly loaded training frame.
train_data.head(n=10)

In [None]:
# Move the index into a regular column, give the columns their working names
# ('segment' -> 'target', 'index' -> 'ID') and encode 'neg'/'pos' as 0/1.
train_data = (
    train_data
    .reset_index(level=0)
    .rename(columns={'segment': 'target', 'index': 'ID'})
    .replace({'target': {'neg': 0, 'pos': 1}})
)

In [None]:
# Same preparation as the training frame: expose the index as an 'ID' column.
test_data = test_data.reset_index(level=0).rename(columns={'index': 'ID'})

## Retrieving Genre Names & Creating columns for Each genre

In [None]:
# Strip every ':<digits>' part from the raw 'genres' string, then split what is
# left on commas, giving one list of genre names per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
train_data['genre_list'] = train_data['genres'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Strip every ':<digits>' part from the raw 'genres' string, then split what is
# left on commas, giving one list of genre names per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
test_data['genre_list'] = test_data['genres'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Collect the set of genres seen in each row, then merge them into one vocabulary.
genre_sets = train_data['genre_list'].apply(frozenset)
genre_vocab = frozenset.union(*genre_sets)
# Add one 0/1 indicator column per genre, named after the genre itself.
for genre in genre_vocab:
    train_data[genre] = [
        int(genre in row_genres) for row_genres in train_data['genre_list']
    ]

In [None]:
# Build the indicator columns from the TRAINING vocabulary rather than from the
# genres that happen to occur in the test set. Deriving the columns from
# test_data alone can produce a different set (or order) of columns than
# train_data has, which misaligns the features when the model predicts.
# Genres that occur only in the test set are ignored; genres missing from the
# test set simply become all-zero columns.
genre_vocab = frozenset.union(*train_data['genre_list'].apply(frozenset))
for genre in genre_vocab:
    test_data[genre] = test_data['genre_list'].apply(
        lambda row_genres: int(genre in row_genres)
    )

## Retrieving DOW & Creating columns for Each DOW

In [None]:
# Strip every ':<digits>' part from the raw 'dow' string, then split what is
# left on commas, giving one list of DOW values per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
train_data['dow_list'] = train_data['dow'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Strip every ':<digits>' part from the raw 'dow' string, then split what is
# left on commas, giving one list of DOW values per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
test_data['dow_list'] = test_data['dow'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Merge the per-row DOW sets into a single vocabulary.
dow_sets = train_data['dow_list'].apply(frozenset)
dow_vocab = frozenset.union(*dow_sets)

# One 0/1 indicator column per DOW value, prefixed with 'dow'.
for dow in dow_vocab:
    train_data['dow' + str(dow)] = [
        int(dow in row_dows) for row_dows in train_data['dow_list']
    ]

In [None]:
# Use the TRAINING vocabulary so test_data receives exactly the same 'dow*'
# columns as train_data. Deriving them from test_data alone can yield a
# different column set/order and misalign the features at prediction time.
dow_vocab = frozenset.union(*train_data['dow_list'].apply(frozenset))

for dow in dow_vocab:
    col = 'dow' + str(dow)
    test_data[col] = test_data['dow_list'].apply(
        lambda row_dows: int(dow in row_dows)
    )

## Retrieving City Names (list only; no per-city columns are created)

In [None]:
# Strip every ':<digits>' part from the raw 'cities' string, then split what is
# left on commas, giving one list of city names per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
train_data['city_list'] = train_data['cities'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Strip every ':<digits>' part from the raw 'cities' string, then split what is
# left on commas, giving one list of city names per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
test_data['city_list'] = test_data['cities'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

## Retrieving TOD & Creating columns for Each TOD

In [None]:
# Strip every ':<digits>' part from the raw 'tod' string, then split what is
# left on commas, giving one list of TOD values per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
train_data['tod_list'] = train_data['tod'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Strip every ':<digits>' part from the raw 'tod' string, then split what is
# left on commas, giving one list of TOD values per row.
# The pattern is a raw string: '\:' inside a normal string literal is an invalid
# escape sequence (a warning on current Python 3 and slated to become an error).
test_data['tod_list'] = test_data['tod'].apply(
    lambda raw: re.sub(r':\d+', '', raw).split(',')
)

In [None]:
# Merge the per-row TOD sets into a single vocabulary.
tod_sets = train_data['tod_list'].apply(frozenset)
tod_vocab = frozenset.union(*tod_sets)
# One 0/1 indicator column per TOD value, prefixed with 'tod'.
for tod in tod_vocab:
    train_data['tod' + str(tod)] = [
        int(tod in row_tods) for row_tods in train_data['tod_list']
    ]

In [None]:
# Use the TRAINING vocabulary so test_data receives exactly the same 'tod*'
# columns as train_data. Deriving them from test_data alone can yield a
# different column set/order and misalign the features at prediction time.
tod_vocab = frozenset.union(*train_data['tod_list'].apply(frozenset))
for tod in tod_vocab:
    col = 'tod' + str(tod)
    test_data[col] = test_data['tod_list'].apply(
        lambda row_tods: int(tod in row_tods)
    )

## Calculating Total time for each row

In [None]:
# For every row, sum the integer that follows the last ':' in each
# comma-separated entry of 'cities'.
# - Series.apply replaces the positional `train_data['cities'][i]` lookups,
#   which only worked because the index had been reset to 0..n-1, and avoids
#   building a throw-away pd.Series per row.
# - The pattern is a raw string: '\:' in a normal string literal is an invalid
#   escape sequence on Python 3.
train_data['total_time'] = train_data['cities'].apply(
    lambda raw: sum(int(re.sub(r'.*:', '', part)) for part in raw.split(','))
)

In [None]:
# For every row, sum the integer that follows the last ':' in each
# comma-separated entry of 'cities'.
# - Series.apply replaces the positional `test_data['cities'][i]` lookups,
#   which only worked because the index had been reset to 0..n-1, and avoids
#   building a throw-away pd.Series per row.
# - The pattern is a raw string: '\:' in a normal string literal is an invalid
#   escape sequence on Python 3.
test_data['total_time'] = test_data['cities'].apply(
    lambda raw: sum(int(re.sub(r'.*:', '', part)) for part in raw.split(','))
)

## Counting the number of titles, genres, etc. for each row

In [None]:
def wcount(p):
    """Return the number of comma-separated items in ``p``.

    The result is the number of commas plus one, so a value without any
    comma (including an empty string) counts as a single item.
    """
    separators = p.count(',')
    return separators + 1

In [None]:
# Derive one '<name>_count' column from each raw comma-separated column.
for count_col, source_col in (
    ('title_count', 'titles'),
    ('genre_count', 'genres'),
    ('city_count', 'cities'),
    ('tod_count', 'tod'),
    ('dow_count', 'dow'),
):
    train_data[count_col] = train_data[source_col].map(wcount)

In [None]:
# Derive one '<name>_count' column from each raw comma-separated column.
for count_col, source_col in (
    ('title_count', 'titles'),
    ('genre_count', 'genres'),
    ('city_count', 'cities'),
    ('tod_count', 'tod'),
    ('dow_count', 'dow'),
):
    test_data[count_col] = test_data[source_col].map(wcount)

In [None]:
# Columns that are not model features: the identifier, the raw strings, the
# intermediate list columns, and the 'NA' indicator column
# (presumably a placeholder genre - confirm against the data).
drop_col = ['ID','cities','dow', 'genres', 'titles', 'tod', 'genre_list', 'dow_list', 'city_list', 'tod_list', 'NA'] ## NA

train = train_data.drop(labels=drop_col, axis=1)
# Split the label off the feature matrix (pop removes the column and returns it).
label = train.pop('target')

# Give the test matrix exactly the training feature columns, in the same order.
# The indicator columns are generated separately for each frame, so dropping
# the same labels from both does not guarantee matching columns: the models
# would then be fed misaligned features, and `drop` itself raises if one of
# `drop_col` is absent from test_data. Columns that exist only in train are
# filled with 0 (a missing indicator means "not present"); columns that exist
# only in test are discarded.
test = test_data.reindex(columns=train.columns, fill_value=0)

In [None]:
# Display the dtype of every column left in the training feature matrix.
train.dtypes

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import auc

In [None]:
# Grid-search n_estimators / colsample_bylevel / subsample for an XGBoost
# classifier, scored by ROC AUC over 3 folds.
model = xgb.XGBClassifier(learning_rate=0.05, max_depth=5, min_child_weight=5, gamma=0.5, reg_lambda=8)
# shuffle=True is required for random_state to have any effect: without it the
# seed is silently ignored by older scikit-learn and rejected with a ValueError
# by newer releases.
kfold = KFold(n_splits=3, shuffle=True, random_state=2017)
grid_params = {
    'n_estimators': [400, 450, 500, 550],
    'colsample_bylevel': [0.5, 0.4, 0.3, 0.2],
    'subsample': [0.7, 0.6, 0.5, 0.4],
}
grid1 = GridSearchCV(estimator=model, param_grid=grid_params, scoring='roc_auc', cv=kfold, n_jobs=-1)
grid1.fit(train, label)


In [None]:
# Report the winning estimator, its parameters and its mean CV score.
# The parenthesised form prints the same text on Python 2 and is valid syntax
# on Python 3, where the bare `print x` statement is a SyntaxError.
print(grid1.best_estimator_)
print(grid1.best_params_)
print(grid1.best_score_)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score



In [None]:
# 3-fold ROC AUC for a small random forest baseline.
model = RandomForestClassifier(n_estimators=200, max_depth=5)
# shuffle=True is required for random_state to have any effect: without it the
# seed is silently ignored by older scikit-learn and rejected with a ValueError
# by newer releases.
kfold = KFold(n_splits=3, shuffle=True, random_state=7)
results = cross_val_score(model, train, label, cv=kfold, scoring='roc_auc', n_jobs=-1)
results

In [None]:
# Fit a larger, deeper forest on the full training set
# (fit returns the estimator itself, so it can be bound directly).
model = RandomForestClassifier(max_depth=8, n_estimators=1500).fit(train, label)


In [None]:
# Class-probability predictions of the fitted forest for the test features.
prediction = model.predict_proba(X=test)

In [None]:
# Keep only column 1 of the 2-D predict_proba output (presumably the
# probability of target == 1 - confirm against model.classes_).
# NOTE: this cell is not idempotent; re-running it on the already-sliced
# 1-D array raises an IndexError.
prediction = prediction[:,1]

In [None]:
# Hold out 40% of the training rows for validation, with a fixed seed.
trainX, valX, train_l, val_l = train_test_split(
    train, label, random_state=2017, test_size=0.4
)


In [None]:
# Every DMatrix is labelled with the training column names.
preds = train.columns
# Full training set (used for the final fit).
DMtrain_all = xgb.DMatrix(data=train, label=label, feature_names=preds)
# Train / validation parts of the hold-out split.
DMtrain = xgb.DMatrix(data=trainX, label=train_l, feature_names=preds)
DMval = xgb.DMatrix(data=valX, label=val_l, feature_names=preds)
# Unlabelled test features.
DMtest = xgb.DMatrix(data=test, feature_names=preds)

In [None]:
# Parameters passed to xgb.train, grouped by purpose.
xgb_params = {
    # learning task and evaluation
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # boosting step size and tree shape
    'eta': 0.05,
    'max_depth': 5,
    'min_child_weight': 5,
    # row / column subsampling
    'subsample': 0.6,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.3,
    # regularisation
    'gamma': 0.4,
    'lambda': 8,
    # runtime settings
    'seed': 2017,
    'nthread': 4,
    'silent': 1,
}

In [None]:
# Train on the hold-out split for up to 2000 rounds, logging every 5 rounds and
# stopping once the last watchlist entry has not improved for 50 rounds.
num_rounds = 2000
watchlist = [(DMtrain, 'Train'), (DMval, 'Validation')]
model = xgb.train(
    params=xgb_params,
    dtrain=DMtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=5,
)
              

In [None]:
# Refit on the full training set for a fixed 410 rounds
# (presumably taken from the early-stopping run above - confirm).
model = xgb.train(params=xgb_params, dtrain=DMtrain_all, num_boost_round=410)

In [None]:
# Score the test DMatrix with the refitted booster (replaces the earlier `prediction`).
prediction = model.predict(data=DMtest)

In [None]:
# Start from the sample submission and overwrite its two columns with the
# test IDs and the model scores.
sub_file = pd.read_csv('sample_submission.csv')
for column, values in (('ID', test_data['ID']), ('segment', prediction)):
    sub_file[column] = values

In [None]:
# Write the submission without the row index.
sub_file.to_csv(path_or_buf='subm_file1.csv', index=False)