In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

from matplotlib import pyplot as plt

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import RFECV

import seaborn as sns
import pickle

In [2]:
with open('dropped_news_df', 'rb') as handle:
    dropped_news = pickle.load(handle)

In [3]:
dropped_news.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_non_stop_words,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,...,avg_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,day_of_week,LDA,lda,channels
0,12.0,219.0,1.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.0,...,-0.350000,0.500000,-0.187500,0.000000,0.187500,593,Monday,,,Entertainment
1,9.0,255.0,1.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.0,...,-0.118750,0.000000,0.000000,0.500000,0.000000,711,Monday,,,Business
2,9.0,211.0,1.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.0,...,-0.466667,0.000000,0.000000,0.500000,0.000000,1500,Monday,,,Business
3,9.0,531.0,1.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.0,...,-0.369697,0.000000,0.000000,0.500000,0.000000,1200,Monday,,,Entertainment
4,13.0,1072.0,1.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.0,...,-0.220192,0.454545,0.136364,0.045455,0.136364,505,Monday,,,Tech
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,11.0,346.0,1.0,9.0,7.0,1.0,1.0,4.523121,8.0,0.0,...,-0.260000,0.100000,0.000000,0.400000,0.000000,1800,Wednesday,,,Tech
39640,12.0,328.0,1.0,9.0,7.0,3.0,48.0,4.405488,7.0,0.0,...,-0.211111,0.300000,1.000000,0.200000,1.000000,1900,Wednesday,,,Social Media
39641,10.0,442.0,1.0,24.0,1.0,12.0,1.0,5.076923,8.0,0.0,...,-0.356439,0.454545,0.136364,0.045455,0.136364,1900,Wednesday,,,
39642,6.0,682.0,1.0,10.0,1.0,1.0,0.0,4.975073,5.0,0.0,...,-0.205246,0.000000,0.000000,0.500000,0.000000,1100,Wednesday,,,World


# Dropping Column from Categorical Variables

In [16]:
dropped_news.drop(columns=['data_channel_is_lifestyle', 'weekday_is_monday', 'LDA_00'], inplace=True)

KeyError: "['data_channel_is_lifestyle' 'weekday_is_monday' 'LDA_00'] not found in axis"

In [17]:
with open('dropped_news_df_model', 'wb') as handle:
    pickle.dump(dropped_news, handle, protocol=pickle.HIGHEST_PROTOCOL)

# OLS Models for Categorical Variables

In [6]:
# ols for days of the week
day_ols = ols(formula='shares~weekday_is_tuesday+weekday_is_wednesday+weekday_is_thursday+weekday_is_friday+weekday_is_saturday+weekday_is_sunday',
              data=dropped_news).fit()
day_ols.summary()

0,1,2,3
Dep. Variable:,shares,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,3.187
Date:,"Fri, 08 May 2020",Prob (F-statistic):,0.00397
Time:,02:23:24,Log-Likelihood:,-414490.0
No. Observations:,38463,AIC:,829000.0
Df Residuals:,38456,BIC:,829000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3589.2986,144.003,24.925,0.000,3307.048,3871.549
weekday_is_tuesday,-434.7610,198.619,-2.189,0.029,-824.060,-45.462
weekday_is_wednesday,-303.1792,198.397,-1.528,0.126,-692.042,85.684
weekday_is_thursday,-445.6283,199.413,-2.235,0.025,-836.482,-54.774
weekday_is_friday,-355.8641,212.055,-1.678,0.093,-771.498,59.770
weekday_is_saturday,466.0003,278.174,1.675,0.094,-79.227,1011.228
weekday_is_sunday,117.9920,266.910,0.442,0.658,-405.158,641.142

0,1,2,3
Omnibus:,106052.442,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5857350663.113
Skew:,34.963,Prob(JB):,0.0
Kurtosis:,1913.486,Cond. No.,7.54


In [None]:
# ols for the LDA variables
lda_ols = ols(formula='shares~LDA_01+LDA_02+LDA_03+LDA_04', data=dropped_news).fit()
lda_ols.summary()

In [None]:
# token in this dataset represents words, so this is an ols for different analysis of token
token_ols = ols(formula='shares~n_tokens_title+n_tokens_content+average_token_length',
                data=dropped_news).fit()
token_ols.summary()

In [None]:
# ols for affect of usage of negative/positive words on the shares
pos_or_neg_ols = ols(formula='shares~global_rate_positive_words+global_rate_negative_words+avg_positive_polarity+avg_negative_polarity',
                     data=dropped_news).fit()
pos_or_neg_ols.summary()

In [None]:
# ols for the way the title was perceived
title_subj_pol_ols = ols(formula='shares~title_subjectivity+title_sentiment_polarity+abs_title_subjectivity+abs_title_sentiment_polarity',
                         data=dropped_news).fit()
title_subj_pol_ols.summary()

In [None]:
# ols for non-text usage in the articles
visuals_ols = ols(formula='shares~num_hrefs+num_self_hrefs+num_imgs+num_videos', data=dropped_news).fit()
visuals_ols.summary()

In [None]:
# ols for the genre of the article
data_channel_ols = ols(formula='shares~data_channel_is_entertainment+data_channel_is_bus+data_channel_is_socmed+data_channel_is_tech+data_channel_is_world',
                       data=dropped_news).fit()
data_channel_ols.summary()

In [8]:
# lil_bit_everything_ols = (formula='shares~weekday_is_wednesday+LDA_01+average_token_length+global_rate_negative_words+title_sentiment_polarity+num_imgs+data_channel_is_world', data=dropped_news).fit()
# lil_bit_everything_ols.summary()
'''WHY IS THIS NOT WORKING'''

'WHY IS THIS NOT WORKING'

# ANOVA Tests for Categorical Variables

In [None]:
day_anova = sm.stats.anova_lm(day_ols, typ=2)
day_anova

In [None]:
lda_anova = sm.stats.anova_lm(lda_ols, typ=2)
lda_anova

In [None]:
token_anova = sm.stats.anova_lm(token_ols, typ=2)
token_anova

In [None]:
pos_or_neg_anova = sm.stats.anova_lm(pos_or_neg_ols, typ=2)
pos_or_neg_anova

In [None]:
title_subj_pol_anova = sm.stats.anova_lm(title_subj_pol_ols, typ=2)
title_subj_pol_anova

In [None]:
visuals_anova = sm.stats.anova_lm(visuals_ols, typ=2)
visuals_anova

In [None]:
data_channel_anova = sm.stats.anova_lm(data_channel_ols, typ=2)
data_channel_anova

In [None]:
# lil_bit_everything_anova = sm.stats.anova_lm(lil_bit_everything_ols, typ=2)
# lil_bit_everything_anova

# Making All 0 Values into 0.000000001 Values

In [13]:
dropped_news.replace(to_replace=0, value=0.000001, inplace=True)

# Creating Target and Feature Variables

In [12]:
target = np.log(dropped_news['shares'])
features = dropped_news.drop(columns='shares')

# Setting up Train/Test Split to View Correlation

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=23,test_size=0.2)  

# Correlation Evaluation

In [None]:
dropped_news.corr()

In [None]:
sns.set(style="white")

# Compute the correlation matrix
corr = X_train.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Variance Inflation Factor Method

In [15]:
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif["features"] = X_train.columns
vif.round(5)
# ask about why my VIF changed after I took out high coefficients before

TypeError: unsupported operand type(s) for -: 'str' and 'str'