In [1]:
# pandas to setup the dataframe and logistic regression classifier to be used for making gender predictions
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.preprocessing import OneHotEncoder

In [2]:
# import the dataset
# The data directory defaults to the original local location; set the LESARA_DATA_DIR
# environment variable to point the notebook at the CSV files on another machine
# without having to edit this cell.
DATA_DIR = os.environ.get('LESARA_DATA_DIR', 'F:\\Lesara')
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [3]:
# number of non-null entries in each column of the training frame
train.count(axis=0)

client_id     3000000
host_name     3000000
page_path     3000000
click_time    3000000
gender        3000000
dtype: int64

In [4]:
# number of non-null entries in each column of the test frame
test.count(axis=0)

client_id     742827
host_name     742827
page_path     742827
click_time    742827
gender             0
dtype: int64

In [5]:
# preview the first five training rows
train.head(n=5)

Unnamed: 0,client_id,host_name,page_path,click_time,gender
0,ef90dd347a4d9c6ab54b2260c0ead5c52ebe480982aadc...,www.lesara.it,54fc1f523d80f473504ec21537ea4fdfaca8ae7d151a5c...,1502970344047,2
1,a2b59f5b6e46b21d9a3c73c10ded2c26e705362190d0f5...,www.lesara.it,8a5d4b53854e81fb2b6086d3df16f1ec07f4160a99c172...,1502970344529,2
2,d2f62a3ec51b100ea6e7338247e1820a6cba30b2ed1e48...,www.lesara.it,cd0ef75ebe8be4785e8ab68b029ee54e376700c3918405...,1502970345265,2
3,a25bc46af1f58744ea4cc210ad39e3b48c78e632de468a...,www.lesara.it,3f74d8ea5206f388aceaf6ff6a3d9111a1ade37fccb938...,1502970346856,2
4,4c414b0337a69934083023a82326883f3ba8bfd385d703...,www.lesara.it,a82cd1fe6e31fe07f238d202395c347dc60a6172d2bbfe...,1502970348018,1


In [6]:
# remove click time column as it is not useful for classification
# drop(..., errors='ignore') returns a new frame and is a no-op when the column is
# already gone, so this cell can be re-run safely (a plain `del` raises KeyError
# the second time it is executed).
train = train.drop(columns=['click_time'], errors='ignore')
test = test.drop(columns=['click_time'], errors='ignore')

In [8]:
# 3 million train records contain mix of 94594 users
# (distinct non-null client_id values)
train.client_id.nunique(dropna=True)

94594

In [9]:
# close to 0.7 million test records contain mix of 39252 users
# (distinct non-null client_id values)
test.client_id.nunique(dropna=True)

39252

In [10]:
# check if train and test datasets contain any common users:
# one boolean per test row, True when that row's client_id also occurs in train
commonusers = test.client_id.isin(train.client_id)

In [12]:
# keep the per-row flags on disk for offline inspection
commonusers.to_csv('commonusers.csv')
# no common users found , all false results returned , hence eliminate client_id feature for classification
# Display the number of test rows whose client_id also appears in train, so the
# statement above is checked on every run instead of by scanning the CSV by hand.
int(commonusers.sum())

In [13]:
# Drop the client_id column from both frames.
# drop(..., errors='ignore') returns a new frame and does nothing when the column
# has already been removed, so re-running this cell does not raise KeyError the
# way a plain `del` does.
train = train.drop(columns=['client_id'], errors='ignore')
test = test.drop(columns=['client_id'], errors='ignore')

In [14]:
# preview the first five training rows after the column removals
train.head(n=5)

Unnamed: 0,host_name,page_path,gender
0,www.lesara.it,54fc1f523d80f473504ec21537ea4fdfaca8ae7d151a5c...,2
1,www.lesara.it,8a5d4b53854e81fb2b6086d3df16f1ec07f4160a99c172...,2
2,www.lesara.it,cd0ef75ebe8be4785e8ab68b029ee54e376700c3918405...,2
3,www.lesara.it,3f74d8ea5206f388aceaf6ff6a3d9111a1ade37fccb938...,2
4,www.lesara.it,a82cd1fe6e31fe07f238d202395c347dc60a6172d2bbfe...,1


In [15]:
# preview the first five test rows after the column removals
test.head(n=5)

Unnamed: 0,host_name,page_path,gender
0,www.lesara.it,91c35d72e63827e1c72b7db820e3d99c9bcd814b1cb2d9...,
1,www.lesara.it,abdf993a5273b2887f9cbc635e124baf0237e7e1f285da...,
2,www.lesara.co.uk,b2635717a4c9efb69edb07e6be2dc3f8a3bce36d6cb899...,
3,www.lesara.it,f526b7f7a6c718c1dafb0f77645402a666faaf25fe2d62...,
4,www.lesara.it,a889fea5a83488be7c1e4c5ed9fefdf0d5ef41ee3f48ca...,


In [16]:
# create the training predictor feature set and class label
x_train = pd.get_dummies(train['host_name'],train['page_path'])
y_train = train['gender']

In [17]:
# create test set and the variable to be predicted
x_test = pd.get_dummies(test['host_name'],test['page_path'])
y_test = test['gender']

In [26]:
# apply logistic regression classifier
model = LR(
    class_weight='balanced',  # weight classes inversely to their frequency in y
)

In [27]:
# train the classifier on the encoded training features and gender labels
model.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [28]:
# make gender predictions: one predicted class label per row of the test feature set
predict = model.predict(X=x_test)

In [29]:
# display the predicted labels (the notebook shows a truncated view of the array)
predict

array([1, 1, 2, ..., 1, 1, 2], dtype=int64)

In [30]:
# number of predictions; should equal the number of rows in the test set
predict.shape[0]

742827

In [31]:
# wrap the prediction array in a single-column frame so it can be exported
genderpredict = pd.DataFrame(data=predict)

In [32]:
# write the predictions (with the default index and header) to gender.csv
genderpredict.to_csv(path_or_buf='gender.csv')