In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the articles-with-engagement dataset from the Kaggle input directory.
articles_csv = '/kaggle/input/internet-articles-data-with-users-engagement/articles_data.csv'
df = pd.read_csv(articles_csv)

In [None]:
# Preview the first rows of the freshly loaded articles table.
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
import seaborn as sns
# Plot the boolean null mask as a heatmap to see where values are missing.
null_mask = df.isnull()
sns.heatmap(null_mask)

In [None]:
# Percentage of rows in which each column is null.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)

In [None]:
# Display the per-column missing-value percentages.
missing_value_df

I will drop the columns in which more than 5% of the values are missing; for the remaining columns, I will drop the rows that contain missing values.

In [None]:
# Remove the three columns being discarded for their high share of missing values.
sparse_columns = ['author', 'url_to_image', 'content']
df = df.drop(columns=sparse_columns)

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# (rows, columns) remaining after dropping the sparse columns and the rows with nulls.
df.shape

In [None]:
# Renumber the rows 0..n-1, since dropna() left gaps in the index.
# drop=True discards the old index instead of inserting it as a stray 'index' column,
# and reassigning (instead of inplace=True) keeps this cell safe to re-run: the
# in-place form adds another column on each run and eventually raises.
df = df.reset_index(drop=True)

In [None]:
# Preview the frame after the index reset.
df.head()

In [None]:
# Recompute the per-column missing percentages on the cleaned frame.
row_count = len(df)
percent_missing = df.isnull().sum() * 100 / row_count
missing_value_df = pd.DataFrame({'column_name': df.columns, 'percent_missing': percent_missing})

In [None]:
# Display the recomputed table; after dropna() every percentage should be 0.
missing_value_df

Now our dataset is ready, but we would like to see whether each title carries a positive or negative sentiment for the reader, so we will run sentiment analysis on the title. The title is the very first thing someone reads.

# Using Vader Lexicon

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon, then build the analyzer used to score the titles.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


In [None]:
# Score every title with VADER; each entry is the dict returned by polarity_scores.
df['scores'] = df['title'].apply(sid.polarity_scores)

In [None]:
# Pull the 'compound' value out of each score dict.
df['compound'] = [score_dict['compound'] for score_dict in df['scores']]
# Binary label: any compound >= 0 (including exactly 0) counts as 'pos', the rest as 'neg'.
df['title_sentiment'] = ['pos' if value >= 0 else 'neg' for value in df['compound']]
df.head()

In [None]:
# Count how many titles were labelled 'pos' versus 'neg'.
df['title_sentiment'].value_counts()

Dealing With Time Stamp

In [None]:
# Parse the published_at column into pandas datetimes so the .dt accessor can be used.
df['published_at'] = pd.to_datetime(df['published_at'])

In [None]:
# Preview the frame now that published_at has been parsed.
df.head()

In [None]:
# Split the publication timestamp into separate date / hour / minute / second columns.
published = df['published_at'].dt
df['date'] = published.date
df['hour'] = published.hour
df['minute'] = published.minute
df['second'] = published.second

In [None]:
# Preview the new date, hour, minute and second columns.
df.head()

LET US REMOVE COLUMNS THAT ARE NOT OF MUCH USE

In [None]:
# Work on an independent copy so the column pruning below leaves df untouched.
df_new = df.copy(deep=True)

In [None]:
# List the available columns before choosing which ones to drop.
df_new.columns

In [None]:
# Columns not used further in this notebook: the CSV row id, the URL, and the
# intermediate sentiment / timestamp fields that were only needed to derive others.
unused_columns = ['Unnamed: 0', 'url', 'scores', 'published_at', 'compound', 'minute', 'second']
df_new = df_new.drop(columns=unused_columns)

In [None]:
# Preview the trimmed frame.
df_new.head()

In [None]:
# Frequency of each distinct engagement_reaction_count value.
df_new['engagement_reaction_count'].value_counts()

In [None]:
# (rows, columns) of the trimmed frame.
df_new.shape

In [None]:
import seaborn as sns

In [None]:
# Bar plot of publication hour per top_article value, split by title sentiment.
sns.catplot(
    data=df_new,
    x='top_article',
    y='hour',
    hue='title_sentiment',
    kind='bar',
)

In [None]:
# Comment counts per top_article value, coloured by title sentiment.
sns.catplot(
    data=df_new,
    x='top_article',
    y='engagement_comment_count',
    hue='title_sentiment',
)

In [None]:
# NOTE(review): this call is identical to the one in the previous cell and draws the
# same figure again. Possibly a different engagement column was intended; confirm or remove.
sns.catplot(y='engagement_comment_count',x='top_article',hue='title_sentiment',data=df_new)

Converting DATA to feed into model

In [None]:
# Preview the data before encoding the categorical columns.
df_new.head()

In [None]:
# Number of articles per source.
df_new['source_name'].value_counts()

In [None]:
from sklearn import preprocessing
# Replace each source name with an integer code learned from the column itself.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df_new['source_name'])
df_new['source_name'] = label_encoder.transform(df_new['source_name'])

In [None]:
# Encode the 'pos'/'neg' sentiment label as integers with its own encoder.
# Refitting the shared `label_encoder` here would overwrite the source_name classes
# it learned, making it impossible to decode source codes back to names later.
sentiment_encoder = preprocessing.LabelEncoder()
df_new['title_sentiment'] = sentiment_encoder.fit_transform(df_new['title_sentiment'])

In [None]:
# Preview the frame after label-encoding source_name and title_sentiment.
df_new.head()

In [None]:
# Model inputs: encoded source, the four engagement counters, encoded title sentiment
# and publication hour. Target: the top_article flag.
feature_columns = [
    'source_name',
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_comment_plugin_count',
    'engagement_share_count',
    'title_sentiment',
    'hour',
]
x = df_new[feature_columns]
y = df_new['top_article']

In [None]:
# Preview the feature matrix.
x.head()

MODEL BUILDING

In [None]:
from sklearn.model_selection import train_test_split
# 75/25 split, stratified on the target so both parts keep the same class balance;
# the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
    random_state=101,
)

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a default logistic regression and report its accuracy on the test split.
log = LogisticRegression().fit(x_train, y_train)
log.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix


In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
y_pred = log.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [None]:
# Show each confusion-matrix cell as a percentage of all test samples.
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='Blues')

# SVM

In [None]:
from sklearn.svm import SVC
# Fit a default support-vector classifier and report its accuracy on the test split.
svc = SVC().fit(x_train, y_train)
svc.score(x_test, y_test)

In [None]:
# Confusion matrix for the SVM, shown as percentages of all test samples.
y_pred = svc.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='coolwarm')

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 30-nearest-neighbours classifier and report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=30).fit(x_train, y_train)
knn.score(x_test, y_test)

In [None]:
# Confusion matrix for KNN, shown as percentages of all test samples.
y_pred = knn.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='PiYG')

# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree and report its accuracy on the test split.
# The tree is seeded (same seed as the train/test split) so the reported score
# is reproducible across kernel restarts instead of varying from run to run.
tree = DecisionTreeClassifier(random_state=101)
tree.fit(x_train, y_train)
tree.score(x_test, y_test)

In [None]:
# Confusion matrix for the decision tree, shown as percentages of all test samples.
y_pred = tree.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='PuOr')

# ENSEMBLE METHODS

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 200-tree random forest and report its accuracy on the test split.
# Seeding the forest (same seed as the train/test split) makes both the score and
# the feature importances read from this model later reproducible across runs.
forest = RandomForestClassifier(n_estimators=200, random_state=101)
forest.fit(x_train, y_train)
forest.score(x_test, y_test)

In [None]:
# Confusion matrix for the random forest, shown as percentages of all test samples.
y_pred = forest.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='Spectral')

FEATURE IMPORTANCES ACCORDING TO RANDOM FOREST ALGORITHM

In [None]:
# Label each importance with its feature name and sort descending, so the ranking
# can be read directly instead of matching a bare array to columns by position.
pd.Series(
    forest.feature_importances_,
    index=x_train.columns,
    name='importance',
).sort_values(ascending=False)

In [None]:
# Show a single training row to display the order of the feature columns.
x_train.head(1)

TOP 3 most important features for our model to predict if article would be TOP ARTICLE OR NOT

1.SOURCE NAME

2.AT WHICH HOUR ARTICLE WAS PUBLISHED

3.TOTAL SHARES FOR ARTICLE


# STACKING ALL ALGORITHMS

In [None]:
# Base (level-0) estimators for the stacking ensemble, as (name, estimator) pairs.
# The tree and forest are seeded (same seed as the train/test split) so the stacked
# model produces the same result on every run.
level0 = [
    ('log', LogisticRegression()),
    ('svc', SVC()),
    ('knn', KNeighborsClassifier(n_neighbors=30)),
    ('tree', DecisionTreeClassifier(random_state=101)),
    ('forest', RandomForestClassifier(random_state=101)),
]


In [None]:
from sklearn.ensemble import StackingClassifier
# Meta-learner that combines the base estimators' outputs.
level1 = LogisticRegression()
# Stacking ensemble: the level0 estimators feed level1, using 5-fold cross-validation.
model = StackingClassifier(
    cv=5,
    final_estimator=level1,
    estimators=level0,
)

In [None]:
# Fit the stacking ensemble on the training split.
model.fit(x_train,y_train)

In [None]:
# Accuracy of the stacked model on the held-out test split.
model.score(x_test,y_test)

In [None]:
# Confusion matrix for the stacked model, shown as percentages of all test samples.
y_pred = model.predict(x_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='Blues')

Let's try and see if unsupervised learning could classify it or not

In [None]:
from sklearn.cluster import KMeans
# Cluster the training features into two groups, one per target class.
# KMeans starts from random centroids; seeding it (same seed as the train/test split)
# keeps the cluster assignments identical from run to run.
km = KMeans(n_clusters=2, random_state=101)
km.fit(x_train)

In [None]:
# KMeans cluster ids are arbitrary (cluster 0 need not correspond to class 0), so
# comparing them directly with y_test can report a swapped, meaningless matrix.
# Translate each cluster into the majority top_article class it captured on the
# training split, then express the test predictions in those class labels.
train_clusters = km.predict(x_train)
cluster_to_class = pd.crosstab(train_clusters, y_train.to_numpy()).idxmax(axis=1)
y_pred = pd.Series(km.predict(x_test)).map(cluster_to_class).to_numpy()

In [None]:

# Confusion matrix between the test labels and the KMeans-derived predictions,
# shown as percentages of all test samples.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cf_matrix / cf_matrix.sum(), fmt='.2%', annot=True, cmap='coolwarm')

In [None]:
# classification_report returns a multi-line string; print it so the table renders
# with real line breaks instead of an escaped one-line repr.
print(classification_report(y_test, y_pred))