# Media company

In [None]:
import numpy as np
import pandas as pd

# Problem: The show's viewership has fallen, and the media company wants to find the reason behind the drop in order to fix it.

In [None]:
# Read the raw CSV and discard the 'Unnamed: 7' column
# (presumably an empty artefact of the export -- TODO confirm).
df = pd.read_csv('../input/media-company/mediacompany.csv').drop(columns='Unnamed: 7')

# Parse the Date strings so the .dt accessor can be used later on.
df['Date'] = pd.to_datetime(df['Date'])

df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
df.isnull().sum() # count of missing values in each column

In [None]:
df.duplicated().sum()# count of rows that exactly repeat an earlier row

# Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise scatter plots of every column except Date.
sns.pairplot(data=df.drop(columns='Date'))
plt.show()

In [None]:
# Annotated correlation heatmap of every column except Date.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.drop(columns='Date').corr(), annot=True, ax=ax)
plt.show()

From the heatmap above, it is clear that Views_show is highly correlated with
Views_platform, Visitors and Ad_impression.

Views_platform and Visitors are also highly correlated with each other.

In [None]:
# Distribution of the target: density histogram, KDE curve (red) and rug marks (green).
# `sns.distplot` is deprecated and scheduled for removal, so the same three layers
# are drawn with the axes-level functions that replace it.
fig, ax = plt.subplots(figsize=(9, 6))
sns.histplot(data=df, x='Views_show', stat='density', alpha=0.4, ax=ax)
sns.kdeplot(data=df, x='Views_show', color='red', ax=ax)
sns.rugplot(data=df, x='Views_show', color='g', ax=ax)
ax.set_title('Distribution of Views_show')
plt.show()

In [None]:
from scipy.stats import shapiro
shapiro(df['Views_show'])
# The Shapiro-Wilk p-value is almost zero, so the hypothesis that the target variable is normally distributed is rejected.

In [None]:
from scipy.stats import norm

# Compare the empirical distribution of the target with a fitted normal curve.
# `sns.distplot(..., fit=norm)` is deprecated, so the fitted pdf is drawn explicitly.
fig, ax = plt.subplots(figsize=(9, 6))
sns.histplot(data=df, x='Views_show', stat='density', alpha=0.4, ax=ax)
sns.kdeplot(data=df, x='Views_show', ax=ax, label='KDE')

# Maximum-likelihood normal fit (mean and standard deviation of the target).
mu, sigma = norm.fit(df['Views_show'])
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black', label='Fitted normal')

ax.set_title('Views_show vs fitted normal distribution')
ax.legend()
plt.show()
# The Shapiro test rejects exact normality of the target, but the histogram is
# roughly bell-shaped. OLS places its normality assumption on the residuals rather
# than on the target itself, so we proceed with the untransformed target.

In [None]:
# Scatter of the target against every predictor, with a LOWESS smoother (red)
# to reveal any non-linear trend.
y = df['Views_show']
num_col = df.drop(columns=['Date', 'Views_show']).columns
for feature in num_col:
    plt.figure(figsize=(10, 6))
    sns.regplot(x=df[feature], y=y, lowess=True, line_kws={'color': 'r'})
    plt.show()

From the graphs above we can see that the target variable shows an increasing trend
with the first three variables. The red curve indicates a possibly non-linear relationship,
which we will examine further in the analysis.

# Checking multicollinearity of the data

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [None]:
y = df['Views_show']
X = df.drop(["Date", 'Views_show'], axis=1)

# variance_inflation_factor regresses each column on the remaining columns and does
# not add an intercept itself. Without a constant column the auxiliary regressions
# use the uncentred R-squared and the VIFs come out inflated, so a constant is
# appended to the design matrix here.
X_with_const = X.assign(const=1.0)

# The constant is the last column, so indices 0..k-1 are exactly the columns of X;
# vif_score therefore stays aligned with X.columns.
vif_score = [vif(X_with_const.values, i) for i in range(X.shape[1])]

In [None]:
# One VIF per predictor, labelled by column name.
pd.DataFrame({'values': vif_score}, index=X.columns)

The VIF values of Visitors and Views_platform are very high.

In [None]:
# Views of the show by calendar month (seaborn aggregates the rows sharing a month).
fig, ax = plt.subplots(figsize=(9, 6))
sns.lineplot(x=df['Date'].dt.month, y=df['Views_show'], ax=ax)
ax.set_title("Views vs month")
plt.show()

In [None]:
# One line plot per predictor against the calendar month.
# Each iteration opens its own figure, so no figure is created before the loop
# (doing so would only leave an empty, unused figure behind).
month = df['Date'].dt.month  # loop-invariant, computed once
for col in num_col:
    plt.figure(figsize=(12, 5))
    sns.lineplot(y=df[col], x=month)
    plt.title("Month vs " + col)
    plt.show()

From the VIF values computed earlier, it is clear that Visitors and Views_platform have
very high VIF values, which can affect the model. We will deal with this problem while modelling the data.

# Modelling

In [None]:
# Target and full predictor matrix (every column except Date and the target).
y = df['Views_show']
X = df.drop(columns=['Date', 'Views_show'])

In [None]:
import statsmodels.api as sm

# First run: OLS on all predictors, with an explicit intercept column.
Xc = sm.add_constant(X)
model = sm.OLS(endog=y, exog=Xc).fit()
model.summary()

In [None]:
# VIF table shown again next to the model summary for reference.
pd.DataFrame({'values': vif_score}, index=X.columns)

Removing Visitors, because Visitors and Views_platform both have high VIF values and are highly correlated with each other.

Removing Cricket_match_india, because its p-value shows it is not significant for the model.

In [None]:
import statsmodels.api as sm

# Second run: refit after dropping Visitors and Cricket_match_india.
y = df['Views_show']
X = df.drop(columns=['Date', 'Views_show', 'Visitors', 'Cricket_match_india'])
Xc = sm.add_constant(X)
model = sm.OLS(endog=y, exog=Xc).fit()
model.summary()

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split ,cross_val_score

# Hold out 30% of the rows (fixed seed) and fit the reduced model on the rest.
y = df['Views_show']
X = df.drop(columns=['Date', 'Views_show', 'Visitors', 'Cricket_match_india'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

Xc_train = sm.add_constant(X_train)
model_train = sm.OLS(endog=y_train, exog=Xc_train).fit()
model_train.summary()  # summary of the model fitted on the training split

In [None]:
# Evaluate the model fitted on the training split against the held-out rows.
# A separate OLS fitted on the test rows would estimate new coefficients and
# would not measure how well the training model generalises.

# has_constant='add' always inserts the intercept column, even if some feature
# happens to be constant within the (small) test split.
Xc_test = sm.add_constant(X_test, has_constant='add')
y_pred_test = np.asarray(model_train.predict(Xc_test))

test_resid = y_test.to_numpy() - y_pred_test
test_ss_tot = ((y_test.to_numpy() - y_test.mean()) ** 2).sum()
test_r2 = 1 - (test_resid ** 2).sum() / test_ss_tot       # out-of-sample R-squared
test_rmse = np.sqrt((test_resid ** 2).mean())
train_rmse = np.sqrt((np.asarray(model_train.resid) ** 2).mean())

# Train vs test fit, side by side.
pd.DataFrame(
    {'R-squared': [model_train.rsquared, test_r2],
     'RMSE': [train_rmse, test_rmse]},
    index=['train', 'test'],
)

Conclusion

From the train and test results above, about 65% of the variance in the target variable is explained by
the variables Views_platform, Ad_impression and character_a.

The media company's viewership is mostly governed by Views_platform, Ad_impression and character_a, whereas
the effect of the cricket match is not strong. The company needs to focus more on Views_platform and character_a, as these variables have high coefficient values.