# Exploratory analysis with scipy.stats

In [None]:
# Import of used modules
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
import scipy.stats as stats
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Descriptive statistics of data

In [None]:
# Load the IMDB movie data and show summary statistics of its numeric columns
DATA_PATH = '../input/imdb-data/IMDB-Movie-Data.csv'
df = pd.read_csv(DATA_PATH)
df.describe()

### The film with max Revenue (Millions)

In [None]:
# Row(s) whose revenue equals the column maximum (ties are all shown)
revenue = df['Revenue (Millions)']
df[revenue.eq(revenue.max())]

### The film with max Metascore

In [None]:
# Row(s) whose Metascore equals the column maximum (ties are all shown)
metascore_col = df['Metascore']
df[metascore_col.eq(metascore_col.max())]

### The film with max Rating and Votes

In [None]:
# Row(s) whose Rating equals the column maximum (ties are all shown)
rating = df['Rating']
df[rating.eq(rating.max())]

## Distribution of target feature (Metascore)

In [None]:
# Observed Metascore values (missing scores excluded) and their sample estimates.
# metascore_mean / metascore_std are reused by the skewness/kurtosis cell below.
metascore = df['Metascore'].dropna()
metascore_mean = metascore.mean()
metascore_std = metascore.std()

# Theoretical normal density f(x), parametrised by the estimates computed above
# (not by hard-coded copies of them) and evaluated over the observed score range.
x = np.linspace(metascore.min(), metascore.max(), 1001)
y = stats.norm.pdf(x, loc=metascore_mean, scale=metascore_std)

fig, ax = plt.subplots(figsize=(5, 5), dpi=80)
# Empirical density
ax.hist(metascore, color='purple', edgecolor='black', density=True,
        label='Metascore hist', alpha=0.3)
# Theoretical density
ax.plot(x, y, label='f(x)', color='purple', linestyle='-', lw=3)
ax.set_title('Metascore')
ax.set_xlabel('Score')
ax.set_ylabel('f(x)')
ax.legend(loc='upper left')
plt.show()

As can be seen from the histogram, the target feature is approximately normally distributed; this is tested formally in the "Normality observation" section below.

## Skewness and Kurtosis of target feature
Skewness:
$$
\gamma _{1}={\frac  {\mu _{3}}{\sigma ^{3}}}.
$$

Kurtosis:
$$
{\displaystyle \operatorname {Kurt} [X]={\frac {\mu _{4}}{\sigma ^{4}} -3}.}
$$

In [None]:
# Standardised third and fourth central moments of the observed Metascore values.
# Uses metascore_mean / metascore_std computed in the distribution cell above.
x_i = df['Metascore'].dropna().values
n_obs = len(x_i)
deviations = x_i - metascore_mean

# Skewness: sum of (x - mean)^3 / n, divided by std^3
skewness = (deviations ** 3 / n_obs) / (metascore_std ** 3)
print('Skewness is: ', skewness.sum())

# Excess kurtosis: sum of (x - mean)^4 / n, divided by std^4, minus 3
kurtosis = (deviations ** 4 / n_obs) / (metascore_std ** 4)
print('Kurtosis is: ', kurtosis.sum() - 3)

### Skewness and Kurtosis comments
- Skewness is < 0, so the longer tail of the distribution lies to the left of the mean.
- Kurtosis is < 0, so the distribution of the feature is "flat-topped":
    fewer of the values are concentrated close to the mean than in a normal distribution.

Both values are close to 0, the value they take for a normal distribution.

## Relationships of target feature with numerical variables

In [None]:
# Scatter plot of Metascore against each numerical feature.
# sns.relplot is a figure-level function: it always creates its own figure, so
# the size is set through height/aspect. Resizing plt.gcf() beforehand only
# produced an extra empty figure and left the real plot at its default size.
cols = ['Rating', 'Votes', 'Revenue (Millions)', 'Runtime (Minutes)']
for col in cols:
    sns.relplot(x="Metascore", y=col, data=df, height=5, aspect=1)
    plt.show()

In [None]:
# One histogram per feature: (column / title, legend label, bar colour)
histogram_specs = (
    ('Votes', ' Votes', 'tab:cyan'),
    ('Revenue (Millions)', 'Revenue (Millions)', 'tab:pink'),
)
for column, legend_label, colour in histogram_specs:
    plt.hist(df[column], label=legend_label, color=colour, linestyle='-', lw=3)
    plt.title(column)
    plt.show()

## Relationships of target feature with categorical variables
### Metascore - Genre relationship

In [None]:
# Only the first listed genre is kept, as it is the most descriptive one
def first_genre(genres):
    """Return the first entry of a comma-separated genre string."""
    return genres.split(',')[0]


df['Main_genre'] = df['Genre'].apply(first_genre)
df.tail()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the main genre as an integer code in a new column
le = LabelEncoder()
le.fit(df['Main_genre'])
df['Main_genre_le'] = le.transform(df['Main_genre'])

# Drop columns that are not used in the rest of the analysis.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=['Rank', 'Genre', 'Description', 'Actors'], errors='ignore')

# Lookup table code -> genre name. LabelEncoder assigns code i to le.classes_[i],
# so enumerating the fitted classes gives the mapping without scanning the frame
# once per code.
dictionary_genres = dict(enumerate(le.classes_))

In [None]:
# Legend for the encoded x-axis: genre code followed by genre name
for code, genre in dictionary_genres.items():
    print(code, genre)
print()

# sns.catplot is a figure-level function that creates its own figure, so the
# intended 20x20 inch size is passed via height/aspect. Resizing plt.gcf()
# beforehand only produced an extra empty figure.
sns.catplot(x="Main_genre_le", y="Metascore", data=df, height=20, aspect=1)
plt.show()

### Comments of Metascore - genre relationship
As can be seen from the plot, the highest Metascores correspond to the Drama, Animation and Biography genres.

The lowest Metascores correspond to the following genres: Romance, Sci-Fi, Thriller and Fantasy. This may be an artefact of the data, because the number of movies in those genres is low.

### Metascore - Director relationship

In [None]:
# Number of distinct values in the Director column (a missing value, if any, counts as one)
print(df['Director'].nunique(dropna=False))

As the number of unique values of df['Director'] is not equal to 1000 (the overall length), some directors have more than one film, so we can estimate which directors have the highest mean Metascore.

In [None]:
# Table 1: mean Metascore per director, highest first.
# (The former `x = df['Director'].value_counts()` was never read and only
# overwrote the shared name `x`, so it has been removed.)
directors = df.groupby('Director', as_index=False)['Metascore'].mean()
print('Table 1')
directors.sort_values(by='Metascore', ascending=False).head(30)

In [None]:
# Table 2: mean Metascore per (title, director) pair, highest first.
# (The former `x = df['Title'].value_counts()` was never read and only
# overwrote the shared name `x`, so it has been removed.)
directors = df.groupby(['Title', 'Director'], as_index=False)['Metascore'].mean()
print('Table 2')
directors.sort_values(by='Metascore', ascending=False).head(20)

### Comments of Metascore - director relationship
The Director feature is not a categorical variable, but it is quite interesting that directors whose
films have the top Metascores have lower Metascores on their other films.
This follows because not everyone from Table 2 appears in the top 30 of Table 1.

## Correlation matrix

In [None]:
# Correlate the numeric columns only. The frame still holds text columns
# (e.g. Title, Director, Main_genre); pandas >= 2.0 raises on them in
# DataFrame.corr() instead of silently dropping them as older versions did.
numeric_df = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(numeric_df.corr(), cmap="YlGnBu", annot=True, linewidths=.5, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

## Conclusion
The 'Metascore' feature is highly correlated with feature 'Rating' (0.63) and not highly with: 'Votes' (0.33) and 'Runtime' (0.21). 

Other significant values of correlation coefficients are:
- Votes and Revenue (0.63);
- Votes and Rating (0.51).

## Preprocessing. Dealing with missing data

In [None]:
# Number of missing values in each column
for column, n_missing in df.isnull().sum().items():
    print(column, n_missing)

The NaN values of Revenue (Millions) can be filled with the column mean.  
The missing Metascore values should not be filled with the mean, because Metascore is the target feature; those rows are dropped instead.

In [None]:
# Impute missing revenue with the column mean, then drop every row that
# still contains a missing value
revenue_mean = df["Revenue (Millions)"].mean()
df["Revenue (Millions)"] = df["Revenue (Millions)"].fillna(value=revenue_mean)
df.dropna(inplace=True)

## Preprocessing. Outliers

In [None]:
# One box plot per numerical feature to reveal outliers
numerical_vars = ['Rating', 'Votes', 'Revenue (Millions)', 'Metascore', 'Runtime (Minutes)']
for column in numerical_vars:
    fig = plt.gcf()
    sns.boxplot(y=column, data=df, color='purple')
    plt.show()

As can be seen from the boxplots, outliers are present in 'Rating', 'Votes', 'Revenue (Millions)' and 'Runtime (Minutes)'.

To demonstrate how the outliers affect, for instance, the correlation coefficients, they can be excluded and the coefficients recalculated.

## Correlation matrix

In [None]:
# Keep only the rows where every numerical feature lies within 3 standard
# deviations of its column mean, then redraw the correlation heatmap
outlier_columns = ['Rating', 'Votes', 'Revenue (Millions)', 'Metascore', 'Runtime (Minutes)']
df1 = df[outlier_columns]
abs_z_scores = np.abs(stats.zscore(df1))
filtered_entries = (abs_z_scores < 3).all(axis=1)
df1 = df1[filtered_entries]

fig = plt.gcf()
sns.set(font_scale=1.5)
sns.heatmap(df1.corr(), cmap="YlGnBu", annot=True, linewidths=.5)
plt.title('Correlation matrix')
plt.show()

Thus, the correlation coefficients changed:

* Revenue - Votes correlation coefficient decreased from 0.63 to 0.58;
* Metascore - Revenue correlation coefficient decreased from 0.14 to 0.082, which is logically expected;
* 'Runtime' and ['Rating', 'Votes'] correlation coefficients also dropped from 0.39/0.41 to 0.34/0.33.

## Normality observation

In [None]:
# Standardise the Metascore observations and plot their quantiles against
# those of the normal distribution
metascore = df['Metascore']
obs = metascore.values
metascore_mean, metascore_std = metascore.mean(), metascore.std()
z = (obs - metascore_mean) / metascore_std
stats.probplot(z, dist="norm", plot=plt)
plt.title("Q-Q plot")
plt.show()

In [None]:
# Shapiro-Wilk Test: H0 is that the observations are normally distributed
alpha = 0.05
stat, p = stats.shapiro(obs)
print('Statistics = %.3f, p-value = %.9f' % (stat, p))
print('H0 is accepted' if p > alpha else 'H0 is rejected')

### Normality comments
The Q-Q plot shows the similarity between the normal distribution and the distribution of the feature.  
As most of the points lie on the 45-degree line, the two distributions appear to be similar.  
To check whether the distributions actually differ, statistical tests can be used.  
As shown above, the Shapiro-Wilk test rejected the null hypothesis, according to which the feature is normally distributed.

### Homoscedasticity
The 'Metascore' feature is correlated with "Votes", "Rating" and "Runtime (Minutes)".  
Homoscedasticity means an equal level of variance of the residuals across the predicted Metascore values.  
As "Votes" and "Rating" are correlated (0.5), the features "Rating" and "Runtime" will be used to build the regression model, because their correlation coefficient is lower: 0.33.  
For building a proper predictive model (y), though, the x variables should be independent.

In [None]:
# Fit a linear regression of Metascore on Rating and Runtime, then predict
# on the same rows
x = df[['Rating', 'Runtime (Minutes)']]
y = df['Metascore']
model = LinearRegression()
model.fit(x, y)
y_pred = model.predict(x)

In [None]:
# Residuals (observed minus predicted) plotted against the predicted values
resid = y.sub(y_pred)
plt.scatter(y_pred, resid, c='orange')
plt.xlabel('Y_predicted')
plt.ylabel('Residuals')
plt.title('Variance of residuals')
plt.show()

### The results of the model

In [None]:
# Report the fitted equation: intercept, one slope per regressor, and R^2
print('Metascore = b0 + b1*Rating + b2*Runtime (Minutes)')
coefficient_labels = ('b0:', 'b1:', 'b2:')
coefficient_values = (model.intercept_, model.coef_[0], model.coef_[1])
for label, value in zip(coefficient_labels, coefficient_values):
    print(label, value)
r_squared = model.score(x, y)
print('Coefficient of determination: ', round(r_squared, 4))

## Homoscedasticity comments
- As can be seen from the 'Variance of residuals' scatterplot, the variance of the residuals is not constant.
- The absence of homoscedasticity is called heteroscedasticity.

## Linearity observation (regplots)

In [None]:
# Scatter of Rating against Metascore with a fitted regression line
ax = sns.regplot(x="Metascore", y="Rating", data=df)
ax.set_title('Metascore-Rating regression')
plt.show()

In [None]:
# Scatter of Votes against Metascore with a fitted regression line
ax = sns.regplot(x="Metascore", y="Votes", data=df)
ax.set_title('Metascore-Votes regression')
plt.show()

In [None]:
# Scatter of Runtime against Metascore with a fitted regression line
ax = sns.regplot(x="Metascore", y="Runtime (Minutes)", data=df)
ax.set_title('Metascore-Runtime regression')
plt.show()

## Results of regression model

In [None]:
# 3D scatter of the observed Metascore against the two regressors.
fig = pyplot.figure()
# Axes3D(fig) no longer attaches itself to the figure in current matplotlib
# releases, which leaves the plot empty; add_subplot registers the 3D axes.
ax = fig.add_subplot(projection='3d')

# x-axis: Runtime (column 1 of x), y-axis: Rating (column 0), z-axis: Metascore
ax.scatter(x.values[:, 1], x.values[:, 0], y.values, c='purple')
ax.set_title('Metascore')
ax.set_xlabel('Runtime (Minutes)')
ax.set_ylabel('Rating')
ax.set_zlabel('Metascore')
pyplot.show()

In [None]:
# 3D scatter of the Metascore predicted by the model against the two regressors.
fig = pyplot.figure()
# Axes3D(fig) no longer attaches itself to the figure in current matplotlib
# releases, which leaves the plot empty; add_subplot registers the 3D axes.
ax = fig.add_subplot(projection='3d')

# x-axis: Runtime (column 1 of x), y-axis: Rating (column 0), z-axis: prediction
ax.scatter(x.values[:, 1], x.values[:, 0], y_pred, c='purple')
ax.set_title('Predicted Metascore')
ax.set_xlabel('Runtime (Minutes)')
ax.set_ylabel('Rating')
ax.set_zlabel('Predicted Metascore')
pyplot.show()

### Linearity comments
The Rating and Runtime features are linearly dependent on the Metascore.