In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Ask the inline backend to render figures as SVG.
# NOTE(review): IPython.display.set_matplotlib_formats is documented as deprecated
# (since IPython 7.23) in favour of matplotlib_inline.backend_inline.set_matplotlib_formats;
# confirm against the IPython version in the image before upgrading.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Analysis

In [None]:
# Read the tab-separated training file; `raw_df` is kept untouched so the cleaning
# below can be re-run without re-reading the file.
raw_df = pd.read_csv('/kaggle/input/stumbleupon/train.tsv', sep='\t')

# `drop` returns a new frame, so `raw_df` keeps the three removed columns.
df = raw_df.drop(columns=['framebased', 'urlid', 'alchemy_category_score'])

# '?' entries in `is_news` are replaced by '0' before the integer cast.
# regex=False states the literal match explicitly: '?' is a regex quantifier, and
# the behaviour of single-character patterns under the default `regex` setting has
# changed between pandas releases.
df['is_news'] = df['is_news'].str.replace('?', '0', regex=False).astype(int)


In [None]:
# Five random rows, transposed so every column is shown as a row.
df.sample(n=5).transpose()

In [None]:
# Mean of the `label` column over the whole training frame.
df['label'].mean()

In [None]:
# Number of rows per `alchemy_category` value.
df['alchemy_category'].value_counts()

In [None]:
# Mean label within each `alchemy_category`, ordered from lowest to highest.
label_mean_by_category = (
    df.groupby('alchemy_category')['label']
    .mean()
    .sort_values()
)
label_mean_by_category

In [None]:
# Horizontal bar chart of the per-category label mean, drawn on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
label_mean_by_category.plot.barh(ax=ax)

In [None]:
# Pairwise correlations between the numeric columns.
# `df` still contains text columns (e.g. `alchemy_category`), and DataFrame.corr()
# raises on them in pandas >= 2.0 instead of dropping them silently, so the
# numeric columns are selected explicitly.
df.select_dtypes(include=np.number).corr()

In [None]:
# Heatmap of absolute pairwise correlations between the numeric columns.
# Numeric columns are selected explicitly because DataFrame.corr() raises on the
# text columns still present in `df` under pandas >= 2.0.
plt.figure(figsize=(12, 8))
sns.heatmap(df.select_dtypes(include=np.number).corr().abs())

In [None]:
# Single-column heatmap: absolute correlation of every numeric column with `label`,
# sorted ascending. Numeric columns are selected explicitly because
# DataFrame.corr() raises on the text columns still present in `df` under pandas >= 2.0.
plt.figure(figsize=(12, 8))
sns.heatmap(
    df.select_dtypes(include=np.number)
    .corr()
    .abs()[['label']]
    .sort_values('label')
)

In [None]:
# Distribution of `linkwordscore`, one violin per `label` value.
sns.violinplot(data=df, x='label', y='linkwordscore')

In [None]:
# Everything except the target, then the numeric subset of those columns.
features_df = df.drop(columns='label')
num_features = features_df.select_dtypes(include=np.number)
num_features.columns

In [None]:
# Summary statistics for every numeric feature column.
num_features.describe()

In [None]:
# Object-dtype columns of `df`, narrowed to the two used as categorical features.
# The dtype is given as the string 'object' because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
cat_features = df.select_dtypes(include=['object'])
cat_features = cat_features[['alchemy_category', 'news_front_page']]
cat_features.sample(5)

In [None]:
# Preview of the one-hot (dummy) encoding of the two categorical columns.
pd.get_dummies(cat_features)

# Modeling

In [None]:
# More robust way to do feature pre-processing.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column bookkeeping: all non-target columns, their numeric subset, and the two
# columns treated as categorical.
features_df = df.drop(columns='label')
FEATURE_COLUMNS = features_df.columns
NUM_FEATURES = features_df.select_dtypes(include=np.number).columns
CAT_FEATURES = ['alchemy_category', 'news_front_page']

# Numeric branch: fill NaN (SimpleImputer's default missing marker) with 0, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, NUM_FEATURES),
    ('cat', categorical_transformer, CAT_FEATURES),
])

In [None]:
# Fit the preprocessor on every training row and keep the transformed matrix.
# NOTE(review): this fit happens before the train/test split further down, so the
# imputer/scaler/encoder have seen the rows later used for evaluation.
features_df = preprocessor.fit_transform(df.drop(columns='label'))
features_df.shape

In [None]:
#features_df = pd.concat([num_features, pd.get_dummies(cat_features)], axis=1)

In [None]:
#Normalization 
#features_df = (features_df - features_df.mean())/ features_df.std()
#features_df.columns
#features_df.sample(5)

In [None]:
# Target vector: the `label` column of the cleaned training frame.
target = df.label
target.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the reported metrics) identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    features_df, target, test_size=0.25, random_state=42
)

In [None]:
from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

# Baseline that always predicts the most frequent class seen during fit.
baseline = DummyClassifier(strategy='most_frequent')
# Alternative model (uses the class actually imported above):
#model = LogisticRegressionCV()
# Seeded so the forest, and the scores reported below, are reproducible across runs.
model = RandomForestClassifier(random_state=42)


In [None]:
# Fit both estimators on the same training split; the cell displays the fitted model.
baseline.fit(x_train, y_train)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report 

# Predict the held-out split with the baseline first, then the model.
baseline_predictions, model_predictions = (
    estimator.predict(x_test) for estimator in (baseline, model)
)


In [None]:
# Classification report for the most-frequent-class baseline on the held-out split.
print(classification_report(y_test, baseline_predictions))

In [None]:
# Same report for the random forest, for comparison against the baseline.
print(classification_report(y_test, model_predictions))

## Submission

In [None]:
# Read the tab-separated test file and clean `is_news` the same way as for training:
# '?' entries become '0' before the integer cast.
test_df = pd.read_csv('/kaggle/input/stumbleupon/test.tsv', sep='\t')
# regex=False states the literal match explicitly: '?' is a regex quantifier, and
# the behaviour of single-character patterns under the default `regex` setting has
# changed between pandas releases.
test_df['is_news'] = test_df['is_news'].str.replace('?', '0', regex=False).astype(int)

In [None]:
# Refit a fresh forest on all preprocessed training rows for the submission.
# Seeded so the submitted predictions can be regenerated exactly.
sub_model = RandomForestClassifier(random_state=42)
sub_model.fit(features_df, target)

# Select the test columns via FEATURE_COLUMNS (the non-target training columns
# recorded when the preprocessor was defined) instead of recomputing the list, then
# reuse the already-fitted preprocessor.
test_features = preprocessor.transform(test_df[FEATURE_COLUMNS])
predictions = sub_model.predict(test_features)

In [None]:
# Submission frame: the test set's `urlid` next to the predicted label.
sub_df = pd.DataFrame({
    'urlid': test_df['urlid'],
    'label': predictions,
})
sub_df.head()

In [None]:
# Write the submission to the current working directory, without the index column.
sub_df.to_csv('submission.csv', index=False)