In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

## Load data

In [None]:
import pandas as pd
from sklearn.preprocessing import Normalizer, StandardScaler
# Shared scaler instance used for the abs_growth_norm columns below.
norm = Normalizer()

# Two modelling frames are read from CSV files in the working directory.
end_df = pd.read_csv("end_df.csv")
posts_companies_df = pd.read_csv("posts_companies_df.csv")


def add_growth_transforms(frame):
    """Add rescaled variants of ``abs_growth`` to ``frame`` and return it.

    Columns written: ``abs_growth_norm``, ``abs_growth_log``,
    ``abs_growth_root`` and, only when the frame does not already carry it,
    ``abs_growth_stand`` (z-score of ``abs_growth`` via StandardScaler).
    The frame is modified in place; it is returned for convenience.
    """
    growth = frame[["abs_growth"]]

    # NOTE(review): sklearn's Normalizer rescales each *row* to unit norm, so
    # on a single column every non-zero value becomes +/-1. Confirm whether a
    # column-wise scaling was intended for abs_growth_norm.
    frame["abs_growth_norm"] = norm.fit_transform(growth).ravel()

    # NOTE(review): np.log gives -inf for zero and NaN for negative growth;
    # confirm abs_growth is strictly positive before relying on this column.
    frame["abs_growth_log"] = np.log(frame["abs_growth"])
    frame["abs_growth_root"] = frame["abs_growth"] ** (1 / 2)

    # The plots and regressions further down use abs_growth_stand, which no
    # cell creates. Derive it here unless the CSV already ships the column,
    # in which case the stored values are left untouched.
    if "abs_growth_stand" not in frame.columns:
        frame["abs_growth_stand"] = StandardScaler().fit_transform(growth).ravel()

    return frame


end_df = add_growth_transforms(end_df)
posts_companies_df = add_growth_transforms(posts_companies_df)

In [None]:
# Frequency of each distinct abs_growth value in the full frame.
end_df["abs_growth"].value_counts()

## Checking assumptions

### Assumption 1: Linearity between DV and IV

#### Check on the whole dataframe, i.e. including companies that did not post any updates

In [None]:
# Scatter of raw abs_growth against nb_trans_words for every company,
# cropped to the window given by the axis limits.
p = sns.relplot(data=end_df, x="nb_trans_words", y="abs_growth")
p.set(title='Linear Relation', xlim=(0, 350), ylim=(0, 50000))

#### Whole dataframe only with standardized dv

In [None]:
# Same scatter as for the raw DV, but using the abs_growth_stand column.
p = sns.relplot(data=end_df, x="nb_trans_words", y="abs_growth_stand")
p.set(title='Linear Relation', xlim=(0, 350), ylim=(0, 1))

#### Check assumption 1 on dataset containing only companies that posted updates

In [None]:
# Raw abs_growth against nb_trans_words, restricted to posts_companies_df.
p = sns.relplot(data=posts_companies_df, x="nb_trans_words", y="abs_growth")
p.set(title='Linear Relation', xlim=(0, 350), ylim=(0, 50000))

#### Check assumption 1 on dataset containing only companies that posted updates, with standardized DV

In [None]:
# abs_growth_stand against nb_trans_words for posts_companies_df; the lower
# y-limit is slightly below zero so negative values remain visible.
p = sns.relplot(data=posts_companies_df, x="nb_trans_words", y="abs_growth_stand")
p.set(title='Linear Relation', xlim=(0, 350), ylim=(-0.2, 1))

### Assumption 2: There are no influential cases biasing the model

#### Check assumption 2 on regression 1

In [None]:
# Regression 1: abs_growth_stand on nb_trans_words plus the control
# variables, fitted by OLS on the full frame.
regression1_formula = 'abs_growth_stand ~ nb_trans_words+ nb_words+ nb_posts+ nb_weeks_active+ goal'
regression1_model = smf.ols(formula=regression1_formula, data=end_df)
regression1 = regression1_model.fit()
print(regression1.summary())

#### Check assumption 2 on regression 2 and 3

In [None]:
# Regression 2: adds the sentiment counts and their interactions with
# nb_trans_words, fitted by OLS on posts_companies_df.
regression2_formula = (
    'abs_growth_stand ~ nb_neg_sentiment + nb_trans_words:nb_neg_sentiment'
    ' + nb_pos_sentiment + nb_trans_words:nb_pos_sentiment'
    ' + nb_trans_words+ nb_words+ nb_posts+ nb_weeks_active+ goal'
)
regression2_model = smf.ols(formula=regression2_formula, data=posts_companies_df)
regression2 = regression2_model.fit()
print(regression2.summary())

In [None]:
# Influence diagnostics for regression 1; rows whose Cook's distance
# exceeds 1 are collected in index1.
infl1 = regression1.get_influence()
sm_fr1 = infl1.summary_frame()
is_influential1 = sm_fr1['cooks_d'].gt(1)
index1 = sm_fr1.loc[is_influential1].index
index1

In [None]:
# Influence diagnostics for regression 2; rows whose Cook's distance
# exceeds 1 are collected in index2.
infl2 = regression2.get_influence()
sm_fr2 = infl2.summary_frame()
is_influential2 = sm_fr2['cooks_d'].gt(1)
index2 = sm_fr2.loc[is_influential2].index
index2

### Remove influential records (Cook's distance > 1)

In [None]:
# Build outlier-free copies; the source frames themselves are not modified.
end_df_no_outliers = end_df.drop(index=index1)
posts_no_outliers = posts_companies_df.drop(index=index2)

### Redo regressions with new data

In [None]:
# Refit regression 1 on the outlier-free frame. The formula is the same
# specification used for the initial regression 1 fit (no sentiment terms);
# the previous version of this cell had regression 2's formula pasted in,
# so the refit was a different model from the one screened for outliers.
regression1_formula = 'abs_growth_stand ~ nb_trans_words+ nb_words+ nb_posts+ nb_weeks_active+ goal'
regression1 = smf.ols(regression1_formula, data=end_df_no_outliers).fit()
print(regression1.summary())

In [None]:
# Refit regression 2 (sentiment counts, their interactions with
# nb_trans_words, and the controls) on the outlier-free posts frame.
regression2_formula = (
    'abs_growth_stand ~ nb_neg_sentiment + nb_trans_words:nb_neg_sentiment'
    ' + nb_pos_sentiment + nb_trans_words:nb_pos_sentiment'
    ' + nb_trans_words+ nb_words+ nb_posts+ nb_weeks_active+ goal'
)
regression2_model = smf.ols(formula=regression2_formula, data=posts_no_outliers)
regression2 = regression2_model.fit()
print(regression2.summary())

### Assumption 3: Residuals are normally distributed

#### Check assumption 3 on regression 1

In [None]:
import numpy as np 
import pylab 
import scipy.stats as stats

# Normal Q-Q plot of the regression 1 residuals.
residuals_end_df = regression1.resid
fig, ax = plt.subplots(figsize=(6, 3))

# Draw on `ax` explicitly instead of going through pylab's current axes, and
# do not rebind `fig` to probplot's return value (a tuple of arrays), so
# `fig` keeps referring to the Figure.
stats.probplot(residuals_end_df, dist="norm", plot=ax)

ax.set_ylim(bottom=-10, top=20)
sns.despine()

#### Check assumption 3 on regression 2 and 3

In [None]:
import numpy as np 
import pylab 
import scipy.stats as stats

# Normal Q-Q plot of the regression 2 residuals.
residuals_post_df = regression2.resid
fig, ax = plt.subplots(figsize=(6, 3))

# Draw on `ax` explicitly instead of going through pylab's current axes, and
# do not rebind `fig` to probplot's return value (a tuple of arrays), so
# `fig` keeps referring to the Figure.
stats.probplot(residuals_post_df, dist="norm", plot=ax)

ax.set_ylim(bottom=-10, top=20)
sns.despine()

### Assumption 4: Homoscedasticity

#### Check assumption 4 on regression 1

In [None]:
# Residuals (x) against fitted values (y) for regression 1, viewed through
# a fixed window so the spread can be inspected.
pred_val_end_df = regression1.fittedvalues.copy()
fig, ax = plt.subplots(figsize=(6, 2.5))
ax.set(
    xlabel="Residual",
    ylabel="Prediction",
    xlim=(-5, 10),
    ylim=(-1, 5),
)

_ = ax.scatter(residuals_end_df, pred_val_end_df)

#### Check assumption 4 on regression 2 and 3

In [None]:
# Residuals (x) against fitted values (y) for regression 2, viewed through
# a fixed window so the spread can be inspected.
pred_val_post_df = regression2.fittedvalues.copy()
fig, ax = plt.subplots(figsize=(6, 2.5))
ax.set(
    xlabel="Residual",
    ylabel="Prediction",
    xlim=(-5, 10),
    ylim=(-1, 5),
)

_ = ax.scatter(residuals_post_df, pred_val_post_df)

### Assumption 5: No multicollinearity

### Assumption 6: DV must be normally distributed

#### Check assumption 6 on regression 1

In [None]:
# Normal Q-Q plot of the DV (abs_growth_stand) in the outlier-free full frame.
fig, ax = plt.subplots(figsize=(6, 3))

# Draw on `ax` explicitly instead of going through pylab's current axes, and
# do not rebind `fig` to probplot's return value (a tuple of arrays), so
# `fig` keeps referring to the Figure.
stats.probplot(end_df_no_outliers.abs_growth_stand, dist="norm", plot=ax)

ax.set_ylim(bottom=-10, top=10)
sns.despine()


In [None]:
# Import the submodule explicitly: a bare `import statsmodels` does not
# guarantee that `statsmodels.stats.stattools` has been loaded, so the
# attribute lookup below only worked if another import had pulled it in.
import statsmodels.stats.stattools

# Robust skewness measures of the DV in the outlier-free full frame.
statsmodels.stats.stattools.robust_skewness(end_df_no_outliers.abs_growth_stand)

In [None]:
from scipy.stats import kurtosis

# Kurtosis of the DV in the outlier-free full frame; fisher=True is scipy's
# default, spelled out here so the definition in use is visible.
kurtosis(end_df_no_outliers["abs_growth_stand"], fisher=True)

#### Check assumption 6 on regression 2 and 3

In [None]:
# Import the submodule explicitly: a bare `import statsmodels` does not
# guarantee that `statsmodels.stats.stattools` has been loaded, so the
# attribute lookup below only worked if another import had pulled it in.
import statsmodels.stats.stattools

# Robust skewness measures of the DV in the outlier-free posts frame.
statsmodels.stats.stattools.robust_skewness(posts_no_outliers.abs_growth_stand)

In [None]:
from scipy.stats import kurtosis

# Kurtosis of the DV in the outlier-free posts frame; fisher=True is scipy's
# default, spelled out here so the definition in use is visible.
kurtosis(posts_no_outliers["abs_growth_stand"], fisher=True)

In [None]:
# Normal Q-Q plot of the DV (abs_growth_stand) in the outlier-free posts frame.
fig, ax = plt.subplots(figsize=(6, 3))

# Draw on `ax` explicitly instead of going through pylab's current axes, and
# do not rebind `fig` to probplot's return value (a tuple of arrays), so
# `fig` keeps referring to the Figure.
stats.probplot(posts_no_outliers.abs_growth_stand, dist="norm", plot=ax)

ax.set_ylim(bottom=-10, top=10)
sns.despine()

In [None]:
# Summary statistics of the posts frame (quartiles are pandas' defaults,
# written out explicitly).
posts_companies_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Summary statistics of the full frame (quartiles are pandas' defaults,
# written out explicitly).
end_df.describe(percentiles=[0.25, 0.5, 0.75])