# Appendix: Statistics (Part 3)

## Correlation & Regression

### Getting and Preparing the Data (Part 1) 

In [None]:
import pandas as pd

In [None]:
# Load the raw movie metadata. low_memory=False makes pandas parse the file in
# one pass instead of chunk-wise, avoiding mixed-dtype inference per column.
movie = pd.read_csv("movies_metadata.csv", low_memory= False)

In [None]:
# Display the raw frame.
movie

In [None]:
# Column dtypes and non-null counts of the raw data.
movie.info()

In [None]:
# Preview the parsed release dates; errors="coerce" turns unparseable entries into NaT.
pd.to_datetime(movie.release_date, errors = "coerce")

In [None]:
# Parse release_date (unparseable values become NaT), use the parsed dates as
# the index, and drop the original string column, which is now redundant.
release_dates = pd.to_datetime(movie["release_date"], errors="coerce")
movie = movie.set_index(release_dates).drop(columns=["release_date"])

In [None]:
# Order the movies chronologically by release date. Rebinding the name
# (instead of inplace=True) keeps the step chainable and avoids mutating a
# frame that an earlier cell already displayed.
movie = movie.sort_index()

In [None]:
# Display the frame, now indexed and sorted by release date.
movie

In [None]:
# Work on an independent copy holding only the columns needed for the
# budget-vs-revenue analysis.
df = movie[["title", "budget", "revenue"]].copy()

In [None]:
# Display the reduced frame (title, budget, revenue).
df

In [None]:
# Check dtypes and non-null counts before converting budget to numeric.
df.info()

In [None]:
# Convert budget to a numeric dtype; values that cannot be parsed become NaN.
df["budget"] = pd.to_numeric(df["budget"], errors="coerce")

### Getting and Preparing the Data (Part 2)

In [None]:
# Display the frame after the numeric conversion of budget.
df

In [None]:
# Confirm the dtypes and see how many values are missing per column.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Divide budget and revenue by one million so they are expressed in millions
# (later plots label them "in MUSD"). The columns are addressed by name so a
# change in column order cannot silently scale the wrong data.
# NOTE: this cell is not idempotent - re-running it divides by one million again.
money_cols = ["budget", "revenue"]
df[money_cols] = df[money_cols] / 1_000_000

In [None]:
# Display the frame after rescaling budget and revenue.
df

In [None]:
# Show the rows that have no title.
df[df["title"].isna()]

In [None]:
# Drop every row with a missing value in any column (title, budget or revenue).
# Rebinding the name instead of inplace=True avoids in-place mutation of a
# frame that earlier cells already displayed.
df = df.dropna()

In [None]:
# Verify that no missing values remain.
df.info()

In [None]:
# Frequency of each budget value (the filter in a later cell removes non-positive budgets).
df.budget.value_counts()

In [None]:
# Frequency of each revenue value (the filter in a later cell removes non-positive revenues).
df.revenue.value_counts()

In [None]:
# Keep only movies that report both a positive revenue and a positive budget.
has_revenue = df["revenue"] > 0
has_budget = df["budget"] > 0
df = df.loc[has_revenue & has_budget]

In [None]:
# Display the filtered frame.
df

In [None]:
# Row count and dtypes after filtering.
df.info()

In [None]:
# Summary statistics of budget and revenue after filtering.
df.describe()

In [None]:
# Movies ordered from highest to lowest budget (display only; df is not modified).
df.sort_values("budget", ascending = False)

In [None]:
# Movies ordered from highest to lowest revenue (display only; df is not modified).
df.sort_values("revenue", ascending = False)

In [None]:
# Persist the cleaned data; the release_date index is written as the first
# column and is read back as the index by the following sections.
df.to_csv("bud_vs_rev.csv")

### How to calculate Covariance and Correlation 

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the cleaned data with release_date parsed as a datetime index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Display the reloaded frame.
df

In [None]:
# Restrict the data to movies released in 2016 (partial-string indexing on the datetime index).
df = df.loc["2016"]

In [None]:
# Display the 2016 subset.
df

In [None]:
# Row count and dtypes of the 2016 subset.
df.info()

In [None]:
# Summary statistics of budget and revenue for 2016.
df.describe()

In [None]:
# Mean of budget and revenue. The numeric columns are selected explicitly
# because df also holds the string column "title", on which pandas >= 2.0
# raises instead of silently skipping it.
df[["budget", "revenue"]].mean()

In [None]:
# Sample variance of budget and revenue. The numeric columns are selected
# explicitly because df also holds the string column "title", on which
# pandas >= 2.0 raises instead of silently skipping it.
df[["budget", "revenue"]].var()

In [None]:
# Covariance matrix of budget and revenue. The numeric columns are selected
# explicitly because df also holds the string column "title", which
# pandas >= 2.0 no longer drops automatically (it raises instead).
df[["budget", "revenue"]].cov()

In [None]:
# Covariance between budget and revenue as a single number.
df.budget.cov(df.revenue)

In [None]:
# Correlation matrix of budget and revenue. The numeric columns are selected
# explicitly because df also holds the string column "title", which
# pandas >= 2.0 no longer drops automatically (it raises instead).
df[["budget", "revenue"]].corr()

In [None]:
# Correlation coefficient between budget and revenue as a single number.
df.budget.corr(df.revenue)

In [None]:
# Correlation computed by hand: covariance divided by the product of the two
# standard deviations.
bud_rev_cov = df.budget.cov(df.revenue)
bud_rev_cov / (df.budget.std() * df.revenue.std())

In [None]:
# Same covariance via numpy; the result is the 2x2 covariance matrix.
np.cov(df.budget, df.revenue)

In [None]:
# Same correlation via numpy; the result is the 2x2 correlation matrix.
np.corrcoef(df.budget, df.revenue)

### Correlation and Scatterplots – visual Interpretation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reload the cleaned data with release_date parsed as a datetime index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Restrict the data to movies released in 2016.
df = df.loc["2016"]

In [None]:
# Display the 2016 subset.
df

In [None]:
# Scatter plot of revenue against budget; the labels are set on the Axes
# returned by pandas rather than through the pyplot state machine.
ax = df.plot.scatter(x="budget", y="revenue", figsize=(15, 10), fontsize=15)
ax.set_xlabel("Budget (in MUSD)", fontsize=13)
ax.set_ylabel("Revenue (in MUSD)", fontsize=13)
plt.show()

In [None]:
# Scatter plot with the marginal distributions of both variables.
# sns.set changes the global plot style for all subsequent figures.
sns.set(font_scale=1.5)
sns.jointplot(
    data=df,
    x="budget",
    y="revenue",
    height=8,
)
plt.show()

### A simple Linear Regression Model with numpy & scipy & seaborn

Create a __simple Linear Regression Model__ with the __independent variable Movie Budget__ and the __dependent variable Movie Revenue__. <br>
Visualize and interpret the __regression coefficients__.

![image.png](attachment:image.png)

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
# Print numpy arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress= True)

In [None]:
# Reload the cleaned data with release_date parsed as a datetime index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Restrict the data to movies released in 2016.
df = df.loc["2016"]

In [None]:
# Display the 2016 subset.
df

In [None]:
# Scatter plot with a fitted regression line; ci=None omits the confidence band.
# sns.set changes the global plot style for all subsequent figures.
sns.set(font_scale=1.5)
sns.lmplot(
    data=df,
    x="budget",
    y="revenue",
    height=8,
    ci=None,
)
plt.show()

__Linear Regression with numpy__

In [None]:
# Independent variable (budget) as a numpy array. The column is selected by
# name rather than by position so the regression input does not depend on
# column order, matching the name-based access used elsewhere in this section.
x = df["budget"].to_numpy()
x

In [None]:
# Dependent variable (revenue) as a numpy array. The column is selected by
# name rather than by position so the regression target does not depend on
# column order, matching the name-based access used elsewhere in this section.
y = df["revenue"].to_numpy()
y

In [None]:
# Fit a degree-1 polynomial (straight line); reg holds [slope, intercept].
reg = np.polyfit(x = x, y = y, deg = 1)
reg

In [None]:
# The same fit, passing the pandas columns directly.
np.polyfit(x = df.budget, y = df.revenue, deg = 1)

In [None]:
# Endpoints of the observed budget range; two points are enough to draw the
# fitted line. numpy reductions replace the Python builtins min/max, which
# iterate over the array element by element.
X = np.array([np.min(x), np.max(x)])

In [None]:
# Display the two x-coordinates of the regression line.
X

In [None]:
# Evaluate the fitted line at the two budget endpoints.
Y = np.polyval(reg, X)

In [None]:
# Display the predicted revenues at the two endpoints.
Y

In [None]:
# Draw the fitted regression line on top of the observed data points.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(X, Y)
ax.scatter(x, y)
ax.set_xlabel("Budget (in MUSD)")
ax.set_ylabel("Revenue (in MUSD)")
plt.show()

__Linear Regression with scipy__

In [None]:
# Display the 2016 subset used for the scipy regression.
df

In [None]:
# Least-squares regression of revenue on budget; the result carries slope,
# intercept, r-value, p-value and standard error.
stats.linregress(x = df.budget, y = df.revenue)

### How to interpret Regression Coefficients (Intercept and Slope)

In [None]:
# Repeat the regression of revenue on budget to read off intercept and slope.
stats.linregress(x = df.budget, y = df.revenue)

In [None]:
# Regression line over the observed data, with grid, axis labels and title.
plt.figure(figsize = (12, 8))
plt.plot(X, Y)
# Pass True explicitly: a bare plt.grid() toggles the current state, so it
# would switch the grid OFF once sns.set() has enabled it by default.
plt.grid(True)
plt.scatter(x = x, y = y)
plt.xlabel("Budget (in MUSD)", fontsize = 15)
plt.ylabel("Revenue (in MUSD)", fontsize = 15)
plt.title("Linear Regression Model Revenue vs. Budget", fontsize = 15)
plt.show()

In [None]:
# Slope computed by hand: covariance of budget and revenue divided by the variance of budget.
df.budget.cov(df.revenue) / df.budget.var()

### Case Study (Part 1): The Market Model (Single Factor Model)

In [None]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [None]:
# Download daily price data for the tickers MSFT and ^SP500TR.
# auto_adjust=False is passed explicitly: newer yfinance releases default to
# auto_adjust=True, which omits the "Adj Close" column selected in a later cell.
# NOTE(review): yfinance treats `end` as exclusive, so 2018-12-31 itself is
# not part of the download - confirm that this is intended.
df = yf.download(["MSFT", "^SP500TR"], start = "2015-12-31", end = "2018-12-31",
                 auto_adjust = False)

In [None]:
# Display the downloaded price data.
df

In [None]:
# Keep only the "Adj Close" prices (the later cells use one column per ticker).
df = df["Adj Close"]

In [None]:
# Display the adjusted close prices.
df

In [None]:
# Period-over-period percentage returns; dropna removes the first row, which has no return.
ret = df.pct_change().dropna()

In [None]:
# Display the returns.
ret

In [None]:
# Scatter plot of MSFT returns against the index returns.
ret.plot(x = "^SP500TR", y = "MSFT", figsize = (12, 8), kind = "scatter")
# Pass True explicitly: a bare plt.grid() toggles the current state, so it
# would switch the grid OFF if a seaborn style has already enabled it.
plt.grid(True)
plt.title("MSFT vs. SP500 (daily returns)", fontsize = 15)
plt.show()

In [None]:
# Pearson correlation between index and MSFT returns, together with its p-value.
r, p_value = stats.pearsonr(ret["^SP500TR"], ret.MSFT)

In [None]:
# Display the correlation coefficient.
r

In [None]:
# Display the p-value of the correlation test.
p_value

### Case Study (Part 2): The Market Model (Single Factor Model)

![image.png](attachment:image.png)

In [None]:
# Regress MSFT returns on the index returns; the slope of this fit is stored
# as beta. The result's named attributes are used instead of tuple unpacking.
market_model = stats.linregress(x=ret["^SP500TR"], y=ret.MSFT)
beta = market_model.slope
intercept = market_model.intercept
rvalue = market_model.rvalue
pvalue = market_model.pvalue
stderr = market_model.stderr

In [None]:
# Display the slope of the regression (beta).
beta

In [None]:
# Display the intercept of the regression.
intercept

In [None]:
# Display the correlation coefficient reported by the regression.
rvalue

In [None]:
# Scatter plot of the returns with the fitted regression line; ci=None omits
# the confidence band. sns.set changes the global plot style.
sns.set(font_scale=1.5)
sns.lmplot(
    data=ret,
    x="^SP500TR",
    y="MSFT",
    height=8,
    ci=None,
)
plt.show()