# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import skew,kurtosis,ttest_ind

# Getting Data 

In [None]:
# Read the tab-separated training file and preview its first five rows.
df = pd.read_csv("/kaggle/input/stumbleupon/train.tsv", delimiter='\t')
df.head()

# Cleaning Data

In [None]:
def DataCleaning(d):
    """Return a cleaned copy of a raw StumbleUpon frame.

    The input frame is not modified. On the copy:

    * ``is_news``: every ``'?'`` is replaced by ``'0'`` and the column is
      cast to ``int``.
    * ``alchemy_category``: the substrings ``'weather'`` and ``'unknown'``
      are replaced by ``'?'``.
    * the columns ``url``, ``boilerplate``, ``framebased`` and
      ``alchemy_category_score`` are dropped.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with string-valued ``is_news`` and ``alchemy_category``
        columns plus the four columns that get dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    data = d.copy()

    # '?' is a regex metacharacter and the default of ``regex`` differs
    # between pandas versions, so request a literal replacement explicitly.
    data['is_news'] = (
        data['is_news'].str.replace('?', '0', regex=False).astype(int)
    )

    # Fold these two category names into the existing '?' placeholder.
    for category in ('weather', 'unknown'):
        data['alchemy_category'] = data['alchemy_category'].str.replace(
            category, '?', regex=False
        )

    data = data.drop(
        columns=['url', 'boilerplate', 'framebased', 'alchemy_category_score']
    )
    return data

In [None]:
# Clean the raw training frame; DataCleaning works on a copy, so `df` is left untouched.
data = df.pipe(DataCleaning)

# Data Overview

In [None]:
# Eyeball five randomly drawn rows of the cleaned data.
data.sample(n=5)

In [None]:
# Does the category relate to the label? Row count and mean label per
# alchemy_category, highest mean first, rounded to two decimals.
(
    data.groupby('alchemy_category')['label']
    .agg(['count', 'mean'])
    .sort_values(by='mean', ascending=False)
    .round(2)
)

In [None]:
# Does the is_news flag relate to the label? Row count and mean label per
# is_news value, highest mean first, rounded to two decimals.
(
    data.groupby('is_news')['label']
    .agg(['count', 'mean'])
    .sort_values(by='mean', ascending=False)
    .round(2)
)

In [None]:
# Absolute correlation of every numeric column with the label, shown as one row.
# Only numeric columns are passed to corr(): `data` still holds string columns
# (e.g. alchemy_category), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them.
data.select_dtypes(include='number').corr()[['label']].abs().T

In [None]:
# Heatmap of the pairwise correlations between numeric columns. String columns
# are excluded up front because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr())

In [None]:
# Frequency of each distinct value in the news_front_page column.
data.news_front_page.value_counts()

# Feature Selection (wrapper method)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy of the cleaned data and replace every object-dtype column by
# integer codes. The single encoder is refitted for each column in turn.
le = LabelEncoder()
FT = data.copy()
cols = list(FT.columns)
for column in cols:
    if FT[column].dtype != 'object':
        continue  # numeric columns are kept as they are
    FT[column] = le.fit_transform(FT[column])

In [None]:
# Candidate features: every column except the first and the last one
# (presumably the urlid identifier and the label -- verify against the column order).
X = FT.iloc[:, slice(1, -1)]
# Target vector.
y = FT.loc[:, 'label']

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.ensemble import RandomForestClassifier as rfc
from mlxtend.feature_selection import ExhaustiveFeatureSelector

In [None]:
# Column names of the candidate feature matrix, frozen into a tuple.
feature_names = (*X.columns,)
feature_names

In [None]:
# Forward sequential feature selection wrapped around a small random forest.
# Other estimators that were tried are kept below as commented alternatives.
sfs1 = SFS(#knn(n_neighbors=5),
           rfc(n_estimators = 10, criterion = 'entropy'),
           #LGR(max_iter=1000),
           k_features='best',
           forward=True,
           floating=False,
           verbose=2,
           #scoring = 'neg_mean_squared_error',  # sklearn regressors
           scoring='accuracy',  # sklearn classifiers
           # Score each candidate subset with 5-fold cross-validation. With cv=0
           # the accuracy is measured on the very rows the forest was fitted on,
           # which is close to 1.0 for almost any subset and makes
           # k_features='best' meaningless.
           cv=5)

# X is a DataFrame, so the selector reports its column names directly; the
# separate custom_feature_names argument is redundant (and is not accepted by
# newer mlxtend releases).
sfs1 = sfs1.fit(X, y)

In [None]:
# Display the fitted selector's `subsets_` attribute.
sfs1.subsets_

In [None]:
# Display the dictionary returned by the selector's get_metric_dict().
sfs1.get_metric_dict()

In [None]:
# Display the selector's k_feature_names_ and k_feature_idx_ attributes together as a tuple.
sfs1.k_feature_names_, sfs1.k_feature_idx_

# Scaling Data

In [None]:
def features_transform(d):
    """Build the model feature matrix from a cleaned frame.

    A fixed list of columns is taken from ``d``. The object-dtype columns
    among them are one-hot encoded (first level of each column dropped), the
    numeric ones are standardised, and both groups are returned side by side.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing at least the columns listed in the body.

    Returns
    -------
    pandas.DataFrame
        Standardised numeric columns followed by the one-hot columns
        (named ``x<i>_<level>``), on a fresh ``RangeIndex``; the index of
        ``d`` is not carried over.

    Notes
    -----
    The encoder and the scaler are created and fitted on ``d`` on every
    call. Two calls on different frames therefore use different scaling
    statistics and may produce different one-hot columns.
    """
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    train = d[[
        'alchemy_category',
        'avglinksize',
        'commonlinkratio_1',
        'commonlinkratio_2',
        'commonlinkratio_3',
        'html_ratio',
        'is_news',
        'lengthyLinkDomain',
        'news_front_page',
        'non_markup_alphanum_characters',
        'numberOfLinks',
        'parametrizedLinkRatio',
        'spelling_errors_ratio',
    ]]

    # `np.object` was only an alias of the builtin `object` and no longer
    # exists in NumPy >= 1.24, so the builtin is used directly.
    Cat_Ft = train.select_dtypes(include=[object])
    Num_Ft = train.select_dtypes(include=[np.number])

    one_hot_encoder = OneHotEncoder(drop='first')
    # Fitting on a plain array keeps the generic ``x0_...`` column names on
    # every scikit-learn version. ``toarray()`` yields an ndarray instead of
    # the legacy ``np.matrix`` that ``todense()`` returns.
    Cat_Features = one_hot_encoder.fit_transform(Cat_Ft.to_numpy()).toarray()
    # ``get_feature_names`` was removed in scikit-learn 1.2; fall back to it
    # only on old releases that lack ``get_feature_names_out``.
    if hasattr(one_hot_encoder, 'get_feature_names_out'):
        encoded_names = one_hot_encoder.get_feature_names_out()
    else:
        encoded_names = one_hot_encoder.get_feature_names()
    Cat_Features = pd.DataFrame(Cat_Features, columns=encoded_names)

    scaler = StandardScaler()
    Num_Features = pd.DataFrame(
        scaler.fit_transform(Num_Ft), columns=Num_Ft.columns
    )

    # Both frames carry a default RangeIndex, so the concat aligns row by row.
    features = pd.concat([Num_Features, Cat_Features], axis=1)

    return features

# Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned rows for evaluation; the fixed seed makes the
# split repeatable.
df_train, df_test = train_test_split(
    data,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Feature matrix and label vector for each side of the split.
# NOTE(review): features_transform fits its own encoder and scaler on whatever
# frame it receives, so the test side is not transformed with the statistics
# learned from the training side.
X_train, y_train = features_transform(df_train), df_train['label']

X_test, y_test = features_transform(df_test), df_test['label']

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression as LGR

# A majority-class dummy serves as the reference point for the 10-tree forest.
baseline = DummyClassifier(strategy='most_frequent')
model = rfc(n_estimators=10)

# Both estimators are fitted on the same training features and labels.
model.fit(X_train, y_train)
baseline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Per-class precision / recall / F1 of the forest on the held-out rows.
print(
    classification_report(
        y_true=y_test,
        y_pred=model.predict(X_test),
    )
)

In [None]:
# Hard class predictions, kept under the original name for any later use.
y_pred = model.predict(X_test)
#print(accuracy_score(y_test, y_pred))

# ROC AUC ranks rows by a continuous score. Feeding it 0/1 predictions reduces
# the curve to a single operating point, so the probability of the second
# class in model.classes_ is used instead.
# NOTE(review): assumes a binary 0/1 label so that column 1 is the positive class.
y_score = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))

## Submission

In [None]:
# Refit the same model object on every labelled row before predicting the
# unlabelled test file.
X_train, y_train = features_transform(data), data['label']

model.fit(X_train, y_train)

In [None]:
# Load and clean the unlabelled test file the same way as the training data.
dd = pd.read_csv("/kaggle/input/stumbleupon/test.tsv", delimiter='\t')
df_test_cleaned = DataCleaning(dd)

# NOTE(review): features_transform refits its encoder and scaler on this frame,
# so the resulting columns and scaling are not guaranteed to match what the
# model was trained on -- confirm before relying on these predictions.
X_test = features_transform(df_test_cleaned)

# One row per test urlid together with the predicted label.
sumbission_df = df_test_cleaned.loc[:, ['urlid']].copy()
sumbission_df = sumbission_df.assign(label=model.predict(X_test))
sumbission_df

In [None]:
# Write the submission to the working directory without the index column.
sumbission_df.to_csv(
    path_or_buf='StumbleUpon_Evergreen_Classification.csv',
    index=False,
)