In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:
        # One path per line, exactly as printing them one by one would.
        print('\n'.join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def df_snapshot(df):
    """Show ``df`` and return it unchanged.

    Intended for use inside a method chain (``.pipe(df_snapshot)``) so an
    intermediate result can be inspected without breaking the chain.

    Parameters
    ----------
    df : object
        The object to show; the notebook passes pandas DataFrames.

    Returns
    -------
    object
        The very same object that was passed in (not a copy).
    """
    try:
        # `display` is not imported anywhere in this notebook; it is only
        # available when the code runs under IPython/Jupyter.
        show = display
    except NameError:
        # Plain-Python fallback (e.g. the notebook exported to a script).
        show = print
    show(df)
    return df

In [None]:
import matplotlib

# Notebook-wide plot styling, applied in one go.
matplotlib.rcParams.update({
    # Typography and text/tick colours.
    'font.weight': "light",
    'font.style': "normal",
    'text.color': "#434343",
    'xtick.color': "#434343",
    'ytick.color': "#434343",
    'axes.titleweight': 'semibold',
    # No spines on any side and no tick marks on the bottom/left axes.
    'axes.spines.left': False,
    'axes.spines.bottom': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'xtick.bottom': False,
    'ytick.left': False,
    # Grid lines: grey, half transparent, drawn below the plotted artists.
    'axes.axisbelow': True,
    'grid.color': 'grey',
    'grid.alpha': 0.5,
})


In [None]:
def data_processing(df):
    """One-hot encode ``tier``, binarise ``gender`` and add a tier-1/gender flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain the columns ``tier`` and ``gender``. A
        leftover CSV index column named ``'Unnamed: 0'`` is dropped when it
        is present and ignored when it is not.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * ``tier`` is replaced by the indicator columns produced by
          ``pd.get_dummies`` (``tier_<value>``),
        * ``gender`` is 1 where the original value equals 1 and 0 otherwise,
        * ``tier_1_gender`` is 1 where both ``tier_1`` and the recoded
          ``gender`` are 1, and 0 otherwise.

    Raises
    ------
    KeyError
        If ``tier`` or ``gender`` is missing from ``df``.
    AttributeError
        If no ``tier_1`` indicator was produced (no row with ``tier == 1``).
    """
    encoded = (
        pd.get_dummies(df, columns=['tier'])
        # errors='ignore': the index column only exists when the CSV was
        # written with its index, so its absence must not be fatal.
        .drop(columns=['Unnamed: 0'], errors='ignore')
    )

    return encoded.assign(
        gender=lambda x: np.where(x['gender'] == 1, 1, 0),
        # Evaluated after `gender` above, so `x.gender` is already the 0/1 recoding.
        tier_1_gender=lambda x: np.where(
            (x.tier_1 == 1) & (x.gender == 1),
            1,
            0
        ),
    )

In [None]:
# Location of the dataset inside the Kaggle input directory.
dirname = '/kaggle/input/trell-social-media-usage-data/'

# Read each CSV and run it straight through the shared preprocessing step.
train = pd.read_csv(os.path.join(dirname,'train_age_dataset.csv')).pipe(data_processing)
test = pd.read_csv(os.path.join(dirname,'test_age_dataset.csv')).pipe(data_processing)

In [None]:
# Print a summary of the processed training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# How are users distributed across the age groups?
# Count users per age group and express each count as a share of all users.
age_group_share = (
    train
    .groupby(['age_group'])
    .agg(user_count = ('userId','count'))
    .assign(percentage = lambda x: x['user_count'] / train.userId.count())
)

# Show the table, then draw the shares as a single stacked horizontal bar.
df_snapshot(age_group_share)
ax = age_group_share[['percentage']].T.plot(kind = 'barh', stacked=True)


In [None]:
# Target and categorical feature names.
target_col = ['age_group']
cat_feature_cols = ['gender', 'tier_1', 'tier_2', 'tier_3', 'tier_1_gender']

# Every remaining column of `train`, in its original order, counts as "numeric".
# NOTE(review): only 'gender' from cat_feature_cols is excluded here, so
# 'tier_1_gender' (and any tier_* indicator columns present in `train`) also
# end up in numeric_feature_cols - confirm this is intended.
_not_numeric = ['gender', 'tier', 'userId' ,'age_group', 'Unnamed: 0']
numeric_feature_cols = train.columns[~train.columns.isin(_not_numeric)].tolist()


# What columns have strong correlations with age_group?

Creations appears to have a strong correlation with the age group, as does the number of words per action.

In [None]:
import scipy.stats

# Spearman rank correlation (and its p-value) between age_group and each
# column in numeric_feature_cols, collected as one record per feature.
corrs = []
for feature in numeric_feature_cols:
    rho, p_value = scipy.stats.spearmanr(train['age_group'], train[feature])
    corrs.append(dict(feature=feature, correlation=rho, correlation_p_value=p_value))

# Tabulate, most negative correlation first.
pd.DataFrame(corrs).sort_values('correlation')

In [None]:
# Mean and median of three selected features within each age group.
(
    train
    .groupby('age_group')
    [['creations', 'avgt2', 'number_of_words_per_action']]
    .agg(['mean', 'median'])
)

In [None]:
# Feature subset used to fit the baseline model and, later, to score the test set.
strong_correlations = ['creations', 'avgt2', 'number_of_words_per_action', 'tier_1_gender']

# To set a baseline, let's just look at a random forest based on the more strongly correlated features.

In [None]:
# These two names were used without ever being imported, which makes the cell
# fail with NameError on a fresh kernel (Restart & Run All).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Baseline: a random forest on the hand-picked feature subset only.
X = train[strong_correlations]
# Kept as a single-column DataFrame because a later cell reads Y_test['age_group'].
Y = train[target_col]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

rf_model = RandomForestClassifier(n_estimators=20, max_depth=15, random_state=42)
# np.ravel flattens the one-column target frame into the 1-D array of labels.
rf_model.fit(X_train, np.ravel(Y_train))

# Accuracy on the data the model was fitted on vs. the held-out split; the
# target is passed in the same flattened form for both.
print('Random Forest:')
print('Training Model accuracy: {:.2%}'.format(rf_model.score(X_train, np.ravel(Y_train))))
print('Test Model accuracy: {:.2%}'.format(rf_model.score(X_test, np.ravel(Y_test))))

In [None]:
# Predict the held-out split and line the predictions up against the true labels.
Y_predict = rf_model.predict(X_test)

prediction_comparison = pd.DataFrame({
    'prediction': Y_predict, 'age_group': Y_test['age_group']
})
# 1 where the predicted age group matches the true one, 0 otherwise.
prediction_comparison['correct'] = np.where(
    prediction_comparison['prediction'] == prediction_comparison['age_group'], 1, 0
)

In [None]:
# Number of rows for every (true age group, predicted age group) pair ...
pair_counts = (
    prediction_comparison
    .groupby(['age_group', 'prediction'])
    .agg(total_predictions = ('correct','count'))
)
# ... and the number of rows per true age group.
group_totals = (
    prediction_comparison
    .groupby(['age_group'])
    .agg(total = ('correct','count'))
)

# Share of each true age group that received each prediction, reshaped to
# one row per true age group and one column per predicted value.
prediction_shares = (
    pair_counts
    .join(group_totals)
    .assign(percentage_in_age_group = lambda x: x['total_predictions'] / x['total'])
    .reset_index()
    .pivot(index='age_group', columns='prediction', values='percentage_in_age_group')
)

# Show the table, then plot it as stacked horizontal bars.
df_snapshot(prediction_shares)
ax = prediction_shares.plot(kind='barh', stacked=True)


It seems that age group 3 is where the predictions go wrong the most; let's dig into features that better predict group 3.

# Preparing a submission

In [None]:
# Score the processed test set with the fitted forest, using the same feature subset it was trained on.
test = test.assign(prediction=rf_model.predict(test[strong_correlations]))

In [None]:
# index=False keeps the file to the two intended columns; by default to_csv
# also writes the row index as an extra, unnamed leading column.
test[['userId', 'prediction']].to_csv('submission.csv', index=False)