In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import pandas_profiling
import matplotlib.pyplot as plt 
import seaborn as sns
import sklearn
from sklearn.metrics import f1_score

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)
        
# Let pandas display up to 40 columns before truncating a frame's repr.
pd.set_option('display.max_columns', 40)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the test split and preview its first ten rows.
test = pd.read_csv(
    "/kaggle/input/trell-social-media-usage-data/test_age_dataset.csv"
)
test.head(n=10)

In [None]:
# Load the training split and preview its first ten rows.
df = pd.read_csv(
    "/kaggle/input/trell-social-media-usage-data/train_age_dataset.csv"
)
df.head(n=10)

There is an `Unnamed: 0` column present, which usually happens when one doesn't pass `index=False` while saving a DataFrame as CSV. This will probably be a source of data leakage; I will look into it later.

In [None]:
# (rows, columns) of the training frame and of the test frame, shown as a pair.
df.shape, test.shape

In [None]:
# Column groups used throughout the notebook.

# Features converted to the pandas `category` dtype further down.
categorical_cols = ["tier", "gender"]

# Label column to predict.
target = "age_group"

# Remaining numeric feature columns.
cont_numerical_cols = [
    "following_rate",
    "followers_avg_age",
    "following_avg_age",
    "max_repetitive_punc",
    "num_of_hashtags_per_action",
    "emoji_count_per_action",
    "punctuations_per_action",
    "number_of_words_per_action",
    "avgCompletion",
    "avgTimeSpent",
    "avgDuration",
    "avgComments",
    "creations",
    "content_views",
    "num_of_comments",
    "weekends_trails_watched_per_day",
    "weekdays_trails_watched_per_day",
    "slot1_trails_watched_per_day",
    "slot2_trails_watched_per_day",
    "slot3_trails_watched_per_day",
    "slot4_trails_watched_per_day",
    "avgt2",
]

# Sanity check: the two id-like columns, the target and both feature groups
# together must account for exactly the columns of the training frame.
expected_columns = {
    "Unnamed: 0",
    "userId",
    target,
    *categorical_cols,
    *cont_numerical_cols,
}
set(df.columns) == expected_columns

In [None]:
# Summary statistics of the training frame, as computed by pandas' describe().
df.describe()

In [None]:
# Convert the categorical features (plus the target, on the training frame)
# to the pandas `category` dtype.
#
# The columns are assigned with `frame[cols] = ...` instead of
# `frame.loc[:, cols] = ...`: from pandas 2.0 on, the `.loc[:, cols]` form
# writes the new values into the existing columns in place and keeps their old
# dtype, which would silently discard the conversion.
train_category_cols = categorical_cols + [target]
df[train_category_cols] = df[train_category_cols].astype('category')
test[categorical_cols] = test[categorical_cols].astype('category')

In [None]:
# Contingency table: rows are `tier` values, columns are `age_group` classes.
pd.crosstab(index=df["tier"], columns=df["age_group"])

In [None]:
# Contingency table: rows are `gender` values, columns are `age_group` classes.
pd.crosstab(index=df["gender"], columns=df["age_group"])

In [None]:
# Number of training rows per target class, without sorting by frequency.
target_column = df[target]
target_column.value_counts(sort=False)

In the target variable `age_group`, the occurrences of classes `2`, `3` and `4` are fairly balanced, while class `1` occurs about `5X` as often. What probably happened here is that the competition dataset was sampled from a bigger dataset and the organizers intentionally created the imbalance by adding additional occurrences of class `1`.

In [None]:
# Side-by-side bar charts of the gender x age_group and tier x age_group
# contingency tables.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
for panel, feature in zip(axs, ("gender", "tier")):
    counts = pd.crosstab(df[feature], df["age_group"])
    counts.plot(kind='bar', ax=panel)
plt.tight_layout()

All the categorical variables are highly imbalanced.

In [None]:
# One violin plot per continuous feature, grouped by target class.
#
# The grid height is derived from the number of features instead of being
# hard-coded, so adding or removing a column from `cont_numerical_cols` cannot
# index past the grid, and grid cells without a feature are hidden rather than
# drawn as empty axes.
n_violin_cols = 3
# Ceiling division; keep at least one row so plt.subplots() is always valid.
n_violin_rows = max(1, -(-len(cont_numerical_cols) // n_violin_cols))
fig, ax = plt.subplots(
    nrows=n_violin_rows,
    ncols=n_violin_cols,
    figsize=(12, n_violin_rows * 4),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
violin_axes = ax.ravel()
for i, col in enumerate(cont_numerical_cols):
    sns.violinplot(x=target, y=col, data=df, ax=violin_axes[i])
for unused_axis in violin_axes[len(cont_numerical_cols):]:
    unused_axis.set_visible(False)
plt.tight_layout()

In [None]:
# Overlay the distribution of the `Unnamed: 0` column for each target class,
# one hue per class, with a legend.
g = sns.FacetGrid(data=df, hue=target)
g.map(sns.distplot, "Unnamed: 0")
g.add_legend();

In [None]:
# Target class counts for rows whose `Unnamed: 0` value lies strictly between
# 200000 and 300000.
row_id = df["Unnamed: 0"]
in_band = (row_id > 200000) & (row_id < 300000)
df.loc[in_band, target].value_counts()

There is actually some significant data leakage. This usually arises from how the data was collected. As I mentioned earlier, additional samples of `age_group` class `1` have been added, and the leakage seems to be a result of that. Since the organizers have been pretty careless in creating the dataset, `userId` may turn out to be important too.

In [None]:
# Distribution of `Unnamed: 0` in the train frame (left) and test frame (right).
fig, axs = plt.subplots(1, 2)
for panel, frame, caption in zip(axs, (df, test), ("train set", "test set")):
    sns.distplot(frame["Unnamed: 0"], ax=panel)
    panel.set_xlabel(caption)
plt.tight_layout();

Both the train and test sets have the same distribution. This graph shows how the train-test split was made on the combined dataset. It also means that any data-leak patterns present in the train set will be valid for the test set as well.

In [None]:
# 2 x 4 grid: top row shows the `Unnamed: 0` distribution and bottom row the
# `userId` distribution, with one column of panels per age_group class 1..4.
fig, axs = plt.subplots(2, 4, figsize=(12, 8))
for row, id_col in enumerate(("Unnamed: 0", "userId")):
    for group in range(1, 5):
        panel = axs[row, group - 1]
        values = df.loc[df[target] == group, id_col]
        sns.distplot(values, ax=panel, norm_hist=True)
        panel.set_xlabel(f"age_group={group}")
    # Label each row once, on its left-most panel.
    axs[row, 0].set_ylabel(id_col)
plt.tight_layout()

In [None]:
# One scatter panel per target class: `Unnamed: 0` on x, `userId` on y.
g = sns.FacetGrid(data=df, col=target)
g.map(sns.scatterplot, "Unnamed: 0", "userId");

In [None]:
# Single large scatter of `userId` against `Unnamed: 0`, coloured by target class.
leak_fig, leak_ax = plt.subplots(figsize=(12, 12))
sns.scatterplot(data=df, x="Unnamed: 0", y="userId", hue=target, ax=leak_ax);

:eyes:

In [None]:
# `userId` against `Unnamed: 0` for the train frame (left) and test frame (right).
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
for panel, frame, caption in zip(axs, (df, test), ("train data", "test data")):
    sns.scatterplot(x="Unnamed: 0", y="userId", data=frame, ax=panel)
    panel.set_xlabel(caption)

In [None]:
# Tag every row with the split it came from (0 = train, 1 = test), stack both
# frames, and plot `userId` against `Unnamed: 0` coloured by that tag.
df["is_test"] = 0
test["is_test"] = 1

combined = pd.concat([df, test])

plt.figure(figsize=(6, 6))
# Only a 20% sample is drawn; the seed is fixed so the figure is identical on
# every run instead of changing with each execution of the cell.
combined_sample = combined.sample(frac=0.2, random_state=0)
sns.scatterplot(x="Unnamed: 0", y="userId", hue="is_test", data=combined_sample);

The `test` set has been randomly sampled from the `combined` dataset. So the occurrences of the `age_group` classes can be expected to be in the same ratio as in the `train` set.

In [None]:
# Remove the temporary `is_test` marker from both frames again.
for frame in (test, df):
    frame.drop(columns="is_test", inplace=True)

# Model inputs: only the two id-like columns are used as features.
leak_features = ['Unnamed: 0', 'userId']
X = df.loc[:, leak_features].values
y = df[target].values
test_X = test.loc[:, leak_features].values

In [None]:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Class-weighted decision tree. The seed is fixed because the tree breaks ties
# between equally good splits at random, which would otherwise make the search
# results vary from run to run.
clf_alg = DecisionTreeClassifier(class_weight="balanced", random_state=0)

# Hyper-parameter grid: both split criteria x leaf sizes 1..9.
parameters = {
    "criterion": ['gini', 'entropy'],
    "min_samples_leaf": list(range(1, 10)),
}
# Weighted F1 as the selection metric. `make_scorer` is imported explicitly
# rather than reached through `sklearn.metrics.<attr>`, which only resolves
# when that submodule has already been imported somewhere else.
f1_scorer = make_scorer(f1_score, average='weighted')
# 10-fold cross-validated grid search using all available cores.
search = GridSearchCV(clf_alg, parameters, scoring=f1_scorer, cv=10, verbose=1, n_jobs=-1)

In [None]:
# Run the cross-validated grid search, then report the winning score and model.
search.fit(X=X, y=y)
print(
    f"best score obtained was: {search.best_score_}"
)
print(
    f"best estimator obtained was: {search.best_estimator_}"
)

In [None]:
# Hyper-parameter combination selected by the grid search.
search.best_params_

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Hold out a stratified 10% of the training rows for validation. The split is
# seeded so the reported score and confusion matrix are reproducible.
# NOTE(review): the hyper-parameters were tuned on all of X, including these
# held-out rows, so this estimate is likely optimistic.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=0
)

# Same configuration as the search winner; seeded for deterministic tie-breaking.
clf = DecisionTreeClassifier(class_weight="balanced", random_state=0, **search.best_params_)

clf.fit(train_x, train_y)
valid_preds = clf.predict(valid_x)

print(f"weighted F1 score on validation set is: {f1_score(valid_y, valid_preds, average='weighted')}")
# Rows are true classes, columns are predicted classes.
confusion_matrix(valid_y, valid_preds)

In [None]:
# Refit the selected configuration on every training row (seeded so the
# submission is reproducible) and predict the test set.
clf = DecisionTreeClassifier(class_weight="balanced", random_state=0, **search.best_params_)
clf.fit(X, y)

test_preds = clf.predict(test_X)
# Predicted class labels and the number of test rows assigned to each.
print(np.unique(test_preds, return_counts=True))

sub_df = pd.read_csv("/kaggle/input/trell-social-media-usage-data/sample_submission.csv")
# Replace the column outright instead of writing through `.loc[:, ...]`, which
# from pandas 2.0 on sets values in place and may coerce the predictions to
# whatever dtype the template column happens to have.
# NOTE(review): assumes the sample submission rows are in the same order as
# the test frame - confirm against the competition files.
sub_df['prediction'] = test_preds
sub_df.to_csv("dt_sumission.csv", index=False)