In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re


In [2]:
train_data = pd.read_json('train.json', orient='index')
test_data = pd.read_json('test.json', orient='index')


In [3]:
train_data.head(10)

Unnamed: 0,cities,dow,genres,segment,titles,tod
train-1,"gurgaon:55494,delhi:31892","1:3412,3:15878,2:1737,5:10975,4:20974,7:17820,...","Cricket:82379,Kabaddi:255,Reality:4751",neg,"Top Raids: Haryana vs Services SCB:103,Day 4: ...","10:26,13:331,12:323,20:21864,21:16233,17:7953,..."
train-10,"delhi:5862,nagar:8916,mumbai:1593","1:5745,3:3025,2:3346,5:123,4:3007,7:1108,6:10","Cricket:15640,Wildlife:730",neg,"Dhoni Quits Captaincy:148,Day 4: India Move in...","11:1661,10:384,20:401,21:798,22:221,16:525,19:..."
train-100,navi mumbai:4142,3:4142,"LiveTV:13,Football:4129",neg,"Star Sports 4:13,Manchester United vs Everton:...","1:1207,0:2406,2:529"
train-1000,"new delhi:4131,chennai:2878,navi mumbai:1339","1:658,3:5867,5:413,4:1339,7:71","TalkShow:658,Cricket:7690",neg,"SRH vs RCB:701,KKR vs KXIP:1042,MI vs SRH:2288...","11:71,20:2417,21:1042,23:2288,19:1872,8:658"
train-10000,"gurgaon:6077,chennai:4055","1:1641,2:480,4:1445,7:1663,6:4900","Drama:5503,Cricket:3283,Reality:1345",neg,"MI vs KKR:304,Yeh Rishta Kya Kehlata Hai:5449,...","20:158,22:4139,17:67,23:1510,19:288,18:56,0:23..."
train-100000,"hyderabad:998,bangalore:2748,gulbarga:43317,be...","1:6707,3:1948,2:3574,5:8525,4:18938,7:8295,6:7344","Action:998,Drama:8795,Cricket:45541",neg,"India vs Australia 2nd Test English:2836,SRH v...","11:3450,10:1243,13:4420,12:4210,20:7050,21:770..."
train-100001,navi mumbai:10155,"1:1575,3:5330,2:1242,4:2007","Action:963,TalkShow:18,Romance:1357,Mythology:...",neg,"Jodi:7222,Maapillai:1357,Mahabharatham:594,Ban...","11:20,12:574,21:1357,22:1066,23:2290,0:4847"
train-100002,"delhi:1571,navi mumbai:12729","1:333,2:2233,5:739,4:268,7:10727","Drama:4344,Cricket:9956",neg,"Chandra Nandni:4344,India vs England 2nd T20I ...","11:242,15:419,14:1877,22:309,19:3063,18:64,1:2..."
train-100003,delhi:1318,"2:34,5:1074,7:210","Cricket:1248,Comedy:70",neg,"India vs Bangladesh Day 2 English:1066,Fielder...","10:844,20:65,17:69,23:78,19:40,9:222"
train-100004,"chandigarh:2214,delhi:3829,mumbai:9465","5:14,4:14292,7:1201","Action:86,Drama:4826,Cricket:10557,Kids:24,Tal...",neg,"The Jungle Book:24,Jolly LLB:4826,Escape Plan:...","11:1,13:1610,12:2626,20:2325,21:2985,17:1252,1..."


In [4]:
train_data.reset_index(level=0, inplace=True)
train_data.rename(columns={'segment':'target', 'index':'ID'}, inplace=True)
train_data.replace({'target':{'neg':0,'pos':1}}, inplace=True)

In [5]:
test_data.reset_index(level=0, inplace=True)
test_data.rename(columns={'index':'ID'}, inplace=True)

## Retrieving Genre Names & Creating columns for Each genre

In [6]:
train_data['genre_list']=[re.sub(pattern='\:\d+', repl='', string=x) for x in train_data['genres']]
train_data['genre_list']=train_data['genre_list'].apply(lambda x: x.split(',')) 
#[x.split(',') for x in train_data['genre_list']]- It will work in same manner only diff is it returns list

In [7]:
test_data['genre_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in test_data['genres'] ]
test_data['genre_list'] = test_data['genre_list'].apply(lambda x: x.split(','))

In [8]:
k=train_data['genre_list'].apply(frozenset)  #Frozensets of genre for each row
t1 = frozenset.union(*k)                     #set of genres(union of frozen sets)
for i in t1:
    train_data[i] = train_data['genre_list'].apply(lambda x: int(i in x))

In [9]:
k=test_data['genre_list'].apply(frozenset)  #Frozensets of genre for each row
t2 = frozenset.union(*k)                    #set of genres(union of frozen sets)
for i in t2:
    test_data[i] = test_data['genre_list'].apply(lambda x: int(i in x))

## Retrieving DOW & Creating columns for Each DOW

In [10]:
train_data['dow_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in train_data['dow']]
train_data['dow_list'] = train_data['dow_list'].apply(lambda x: x.split(','))

In [11]:
test_data['dow_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in test_data['dow']]
test_data['dow_list'] = test_data['dow_list'].apply(lambda x: x.split(','))

In [12]:
k = train_data['dow_list'].apply(frozenset)
t1 = frozenset.union(*k)

for i in t1:
    col='dow'+str(i)
    train_data[col] = train_data['dow_list'].apply(lambda x: int(i in x))

In [13]:
k = test_data['dow_list'].apply(frozenset)
t2 = frozenset.union(*k)

for i in t2:
    col='dow'+str(i)
    test_data[col] = test_data['dow_list'].apply(lambda x: int(i in x))

## Retrieving City Names & Creating columns for Each city

In [14]:
train_data['city_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in train_data['cities']]
train_data['city_list'] = train_data['city_list'].apply(lambda x: x.split(','))

In [15]:
test_data['city_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in test_data['cities']]
test_data['city_list'] = test_data['city_list'].apply(lambda x: x.split(','))

## Retrieving TOD

In [16]:
train_data['tod_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in train_data['tod']]
train_data['tod_list'] = train_data['tod_list'].apply(lambda x:x.split(','))

In [17]:
test_data['tod_list'] = [re.sub(pattern='\:\d+', repl='', string=x) for x in test_data['tod']]
test_data['tod_list'] = test_data['tod_list'].apply(lambda x: x.split(','))

In [18]:
k = train_data['tod_list'].apply(frozenset)
t1 = frozenset.union(*k)
for i in t1:
    col='tod'+str(i)
    train_data[col] = train_data['tod_list'].apply(lambda x: int(i in x))

In [19]:
k = test_data['tod_list'].apply(frozenset)
t2 = frozenset.union(*k)
for i in t2:
    col = 'tod'+str(i)
    test_data[col] = test_data['tod_list'].apply(lambda x: int(i in x))

## Calculating Total time for each row

In [20]:
t1=[]
for i in np.arange(train_data.shape[0]):
    a=np.sum(pd.Series(re.sub(pattern='.*\:', repl='', string=x) for x in (train_data['cities'][i].split(','))).apply(int))
    t1.append(a)

train_data['total_time']=t1

In [21]:
t1=[]
for i in np.arange(test_data.shape[0]):
    a=np.sum(pd.Series(re.sub(pattern='.*\:', repl='', string=x) for x in (test_data['cities'][i].split(','))).apply(int))
    t1.append(a)

test_data['total_time']=t1

## Counting Number of titles, genres etc, for each row

In [22]:
def wcount(p):
    return (p.count(',')+1)

In [23]:
train_data['title_count'] = train_data['titles'].map(wcount)
train_data['genre_count'] = train_data['genres'].map(wcount)
train_data['city_count'] = train_data['cities'].map(wcount)
train_data['tod_count'] = train_data['tod'].map(wcount)
train_data['dow_count'] = train_data['dow'].map(wcount)

In [24]:
test_data['title_count'] = test_data['titles'].map(wcount)
test_data['genre_count'] = test_data['genres'].map(wcount)
test_data['city_count'] = test_data['cities'].map(wcount)
test_data['tod_count'] = test_data['tod'].map(wcount)
test_data['dow_count'] = test_data['dow'].map(wcount)

In [25]:
drop_col = ['ID','cities','dow', 'genres', 'titles', 'tod', 'genre_list', 'dow_list', 'city_list', 'tod_list',
           'NA']
train = train_data.drop(labels=drop_col, axis=1)
test = test_data.drop(labels=drop_col, axis=1)
label = train['target']
train.drop(labels=['target'], inplace=True, axis=1) 

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import auc

In [45]:
model = RandomForestClassifier(n_estimators=200, max_depth=5 )
kfold = KFold(n_splits=3, random_state=7)
results = cross_val_score(model,train,label,cv=kfold,scoring='roc_auc',n_jobs=-1)
results

array([ 0.78173452,  0.79900969,  0.79452246])

In [32]:
model = RandomForestClassifier(n_estimators=200, max_depth=7 )
model.fit(train,label);


In [43]:
prediction = model.predict_proba(test)

In [44]:
prediction = prediction[:,1]

In [132]:
import xgboost as xgb
from sklearn.model_selection import train_test_split


In [133]:
trainX, valX, train_l, val_l = train_test_split(train,label, test_size=0.33, random_state=2017)


In [134]:
preds = train.columns
DMtrain_all = xgb.DMatrix(train, label, feature_names=preds)
DMtrain = xgb.DMatrix(trainX, train_l, feature_names=preds)
DMval = xgb.DMatrix(valX, val_l, feature_names=preds)
DMtest = xgb.DMatrix(test, feature_names=preds)

In [135]:
xgb_params={
    'eta':0.1,
    'max_depth':6,
    'subsample':0.7,
    'objective':'binary:logistic',
    'eval_metric':'auc',
    'min_child_weight':1,
    'silent':1,
    'seed':7,
    'nthread':4
}

In [136]:
watchlist = [(DMtrain, 'Train'), (DMval, 'Validation')]
num_rounds = 2000
model = xgb.train(xgb_params, DMtrain, num_rounds, watchlist, early_stopping_rounds=50,verbose_eval=5)
              

[0]	Train-auc:0.78429	Validation-auc:0.779239
Multiple eval metrics have been passed: 'Validation-auc' will be used for early stopping.

Will train until Validation-auc hasn't improved in 50 rounds.
[5]	Train-auc:0.798306	Validation-auc:0.789199
[10]	Train-auc:0.802334	Validation-auc:0.792436
[15]	Train-auc:0.805481	Validation-auc:0.794851
[20]	Train-auc:0.807826	Validation-auc:0.796302
[25]	Train-auc:0.810553	Validation-auc:0.797575
[30]	Train-auc:0.813545	Validation-auc:0.79924
[35]	Train-auc:0.816104	Validation-auc:0.800367
[40]	Train-auc:0.818758	Validation-auc:0.801593
[45]	Train-auc:0.821336	Validation-auc:0.802599
[50]	Train-auc:0.823481	Validation-auc:0.803081
[55]	Train-auc:0.82518	Validation-auc:0.803466
[60]	Train-auc:0.827159	Validation-auc:0.804026
[65]	Train-auc:0.829049	Validation-auc:0.804311
[70]	Train-auc:0.8309	Validation-auc:0.804659
[75]	Train-auc:0.832746	Validation-auc:0.804763
[80]	Train-auc:0.833746	Validation-auc:0.804698
[85]	Train-auc:0.835473	Validation-auc

In [102]:
train.shape

(200000, 71)

In [111]:
model = xgb.train(xgb_params, DMtrain_all, num_boost_round=int(438/0.95))

In [112]:
prediction = model.predict(DMtest)

In [41]:
sub_file = pd.read_csv('sample_submission.csv')
sub_file['ID'] = test_data['ID']
sub_file['segment'] = prediction

In [42]:
sub_file.to_csv('subm_file.csv', index=False)