In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join with the directory currently being visited so the printed path is complete.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set; the file is tab-separated rather than comma-separated.
train_path = '../input/stumbleupon/train.tsv'
df = pd.read_csv(train_path, sep='\t')
df.head()

# Data analysis

In [None]:
# Summarise the column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

In [None]:
# List the distinct values taken by the target column `label`.
df['label'].unique()

In [None]:
# Visualise the features: one histogram per numeric column.
df.hist(figsize=(20,15))

In [None]:
# The feature `framebased` is expected to hold a single value (0);
# list its distinct values to confirm before dropping it.
df['framebased'].unique()

In [None]:
# Remove the `framebased` and `urlid` columns from the dataset.
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the original
# in-place drops raised KeyError on a second execution because the columns were gone.
df = df.drop(columns=['framebased', 'urlid'], errors='ignore')
# Replace the '?' placeholder in `is_news` with 0 and cast the column to integer.
# - astype(str) lets the cell run again after the column has already become int
#   (the .str accessor is only available on string data).
# - regex=False makes the literal match explicit: '?' is a regex metacharacter and
#   the default of `regex` differs between pandas versions.
df['is_news'] = (
    df['is_news'].astype(str).str.replace('?', '0', regex=False).astype(int)
)
df.head(3)

In [None]:
# Re-inspect the column dtypes and non-null counts after the cleaning step.
df.info()

In [None]:
# Mean label and number of rows per `alchemy_category`, largest groups first.
(
    df.groupby('alchemy_category')['label']
      .agg(['mean', 'count'])
      .sort_values(by='count', ascending=False)
)

In [None]:
# Heatmap of the absolute correlation of each numeric feature with `label`,
# sorted in ascending order.
# The frame still contains object (string) columns; DataFrame.corr() raises on those
# in pandas >= 2.0, so the numeric columns are selected explicitly first. Older pandas
# dropped non-numeric columns silently, so the plotted values are unchanged there.
label_corr = (
    df.select_dtypes(include=[np.number])
      .corr()
      .abs()[['label']]
      .sort_values('label')
)
sns.heatmap(label_corr)

In [None]:
# Compare the distribution of `linkwordscore` across the values of `label`.
sns.violinplot(x='label',y='linkwordscore', data=df)

In [None]:
# Feature-only copy of the data: every column except `label`.
df1 = df.drop(columns='label')

In [None]:
# Collect the numeric-typed feature columns and preview a few random rows.
num_feat = df1.select_dtypes(include='number')
num_feat.sample(5)

In [None]:
# Standardise every numeric column: subtract its mean and divide by its standard deviation.
col_mean = num_feat.mean()
col_std = num_feat.std()
num_feat = (num_feat - col_mean) / col_std

In [None]:
# Select the columns with object dtype and preview a few random rows.
# The dtype is given as the string 'object': the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
cat_feat = df1.select_dtypes(include=['object'])
cat_feat.sample(5)


In [None]:
# Frequency of each distinct value in `news_front_page`.
df['news_front_page'].value_counts()

In [None]:
# Frequency of each distinct value in `alchemy_category_score`.
df['alchemy_category_score'].value_counts()

In [None]:
# Frequency of each distinct value in `is_news`.
df['is_news'].value_counts()

In [None]:
# Only `alchemy_category` and `news_front_page` are kept as categorical inputs for modelling.
selected_cat_cols = ['alchemy_category', 'news_front_page']
cat_feat = cat_feat[selected_cat_cols]
cat_feat.head()

In [None]:
# One-hot encode the categorical features to turn them into numeric indicator columns
# (preview only; the result is not stored here).
pd.get_dummies(cat_feat)

In [None]:
# Build the model matrix: standardised numeric columns side by side with the
# one-hot encoded categorical columns.
cat_dummies = pd.get_dummies(cat_feat)
df_f = pd.concat([num_feat, cat_dummies], axis='columns')
df_f.columns

# Modeling

In [None]:
# Feature pre-processing: a ColumnTransformer that scales the numeric columns and
# one-hot encodes the selected categorical columns.

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

features_df = df.drop(columns='label')
FEATURE_COLUMNS = features_df.columns
NUM_FEATURES = features_df.select_dtypes(include='number').columns
CAT_FEATURES = ['alchemy_category', 'news_front_page']

# Numeric branch: fill NaN with the constant 0, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing entries with the literal 'missing', then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its own branch; columns in neither list are not
# passed to any transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, NUM_FEATURES),
    ('cat', categorical_transformer, CAT_FEATURES),
])

In [None]:
# Fit the preprocessor on every column except `label` and keep the transformed matrix.
model_inputs = df.drop(columns='label')
features_df = preprocessor.fit_transform(model_inputs)
features_df.shape

In [None]:
# Target vector for the models; its length should match the number of feature rows.
label = df['label']
label.shape

In [None]:
# Split the dataset, holding out 25% of the rows for evaluation.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each "Restart & Run All"; the original split was unseeded.
# NOTE(review): the split consumes `df_f` (manual scaling + get_dummies), not the
# `features_df` matrix produced by `preprocessor` - confirm which one is intended.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    df_f, label, test_size=0.25, random_state=42
)

In [None]:
# (rows, columns) of the training features.
x_train.shape

In [None]:
# (rows, columns) of the held-out test features.
x_test.shape

In [None]:
# First model: logistic regression with built-in cross-validation, default settings.
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV()

In [None]:
# Fit the logistic regression on the training split.
model.fit(x_train, y_train)

In [None]:
# Mean of the predicted training labels; with 0/1 labels this would be the share of
# rows predicted as class 1 - TODO confirm the label encoding.
model.predict(x_train).mean()

In [None]:
# Mean of the true training labels, for comparison with the mean of the predictions.
y_train.mean()

In [None]:
# Predict labels for the held-out test split with the logistic regression.
model_pred = model.predict(x_test)

In [None]:
# Evaluation: text report of per-class metrics for the logistic regression on the test split.
from sklearn.metrics import classification_report
print(classification_report(y_test, model_pred))

In [None]:
# Second model: random forest classifier with default hyper-parameters.
# random_state seeds the randomness used when building the trees, so the fitted
# forest and its reported metrics are reproducible across runs; the original
# estimator was unseeded.
from sklearn.ensemble import RandomForestClassifier
modelR = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the same training split as the logistic regression.
modelR.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split with the random forest.
modelR_predictions = modelR.predict(x_test)

In [None]:
# Evaluation: text report of per-class metrics for the random forest on the test split.
# Relies on `classification_report` having been imported in an earlier cell.
print(classification_report(y_test, modelR_predictions))