## Simple Linear Regression

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the Fitbit CSV into a DataFrame and preview its first rows.
fitbit_csv = '../../03_data-visualization/02_lab-matplotlib-seaborn/your-code/Fitbit2.csv'
data = pd.read_csv(fitbit_csv)
data.head()

Unnamed: 0,Date,Calorie burned,Steps,Distance,Floors,Minutes Sedentary,Minutes Lightly Active,Minutes Fairly Active,Minutes Very Active,Activity Calories,...,Distance_miles,Days,Days_encoded,Work_or_Weekend,Hours Sleep,Sleep efficiency,Yesterday_sleep,Yesterday_sleep_efficiency,Months,Months_encoded
0,2015-05-08,1934,905,0.65,0,1.355,46,0,0,1680,...,0.403891,Friday,4.0,1,6.4,92.086331,0.0,0.0,May,5
1,2015-05-09,3631,18925,14.11,4,611.0,316,61,60,2248,...,8.767545,Saturday,5.0,0,7.566667,92.464358,6.4,92.086331,May,5
2,2015-05-10,3204,14228,10.57,1,602.0,226,14,77,1719,...,6.567891,Sunday,6.0,0,6.45,88.761468,7.566667,92.464358,May,5
3,2015-05-11,2673,6756,5.02,8,749.0,190,23,4,9620,...,3.119282,Monday,0.0,1,5.183333,88.857143,6.45,88.761468,May,5
4,2015-05-12,2495,502,3.73,1,876.0,171,0,0,7360,...,2.317714,Tuesday,1.0,1,6.783333,82.892057,5.183333,88.857143,May,5


### Simple Regression Plot

In [None]:
# Draw on an explicit Axes so the figure can carry a title and stand on its own;
# the trailing semicolon keeps the return value's repr out of the cell output.
fig, ax = plt.subplots()
sns.regplot(x="NumberOfAwakings", y="MinutesOfBeingAwake", data=data, ax=ax)
ax.set_title("MinutesOfBeingAwake vs. NumberOfAwakings");


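The model being fitted is $\text{MinutesOfBeingAwake} = b_0 + b_1 \cdot \text{NumberOfAwakings}$, where $b_0$ is the intercept and $b_1$ is the slope.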

### Estimating OLS using statsmodels

In [None]:
import statsmodels.api as sm

In [None]:
# Show the built-in documentation for sm.OLS before it is used below.
help(sm.OLS)

In [None]:
# Response variable for the regression.
Y = data['MinutesOfBeingAwake']
# add_constant is applied to the regressor so the model is given a constant
# (intercept) column alongside NumberOfAwakings.
X = sm.add_constant(data['NumberOfAwakings'])
model = sm.OLS(Y, X)
results = model.fit()
# Last expression of the cell: shows the fitted model's summary.
results.summary()

### Estimating OLS parameters using scipy.stats

In [None]:
from scipy.stats import linregress
# Show linregress's documentation before it is used in the next cell.
help(linregress)

In [None]:
# Regress MinutesOfBeingAwake (Y) on NumberOfAwakings (X) with scipy.
X, Y = data['NumberOfAwakings'], data['MinutesOfBeingAwake']
slope, intercept, r_value, p_value, std_err = linregress(X, Y)
# sep='' reproduces plain string concatenation of the label and the value.
print('The slope is: ', slope, sep='')
print('The intercept is: ', intercept, sep='')

#### Calculating Confidence Intervals

In [None]:
from scipy import stats
help(stats.t.interval)
# Two parameters (slope and intercept) are estimated, hence n - 2 degrees of freedom.
d_freedom = len(Y) - 2
# 95% t-interval centred on the estimated slope and scaled by its standard error.
stats.t.interval(0.95, d_freedom, loc=slope, scale=std_err)

#### Predictions

In [None]:
# Fitted values from the estimated line, computed with vectorised Series
# arithmetic instead of a Python-level loop. .tolist() keeps `predictions`
# a plain list of Python floats, as later cells index it by position.
predictions = (intercept + slope * X).tolist()
# Preview only the first few fitted values instead of dumping the whole list.
print(predictions[:10])

In [None]:
# A notebook only echoes a bare expression when it is the cell's last
# statement, so the correlation matrix must be printed explicitly or it is lost.
print(np.corrcoef(Y, predictions))

# Observed values against fitted values, with labelled axes.
plt.scatter(Y, predictions)
plt.xlabel('Observed MinutesOfBeingAwake')
plt.ylabel('Predicted MinutesOfBeingAwake')
plt.show()

#### Calculating residuals

In [None]:
# Residual = observed - fitted. Values are paired by position with zip rather
# than looked up with Y[i], which is label-based on a Series and fails when
# the index is not exactly 0..n-1.
residuals = [observed - fitted for observed, fitted in zip(Y, predictions)]

## Using sklearn

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Show the LinearRegression documentation before the estimator is fitted below.
help(linear_model.LinearRegression)

In [None]:
# Build the feature matrix and target directly from `data` instead of
# transforming whatever X currently holds: the cell can then be re-run safely
# and does not depend on which earlier cell last assigned X.
# The column is reshaped to (n_samples, 1) before being passed to fit().
X = data['NumberOfAwakings'].to_numpy().reshape(-1, 1)
Y = data['MinutesOfBeingAwake']
lm = linear_model.LinearRegression()
model = lm.fit(X, Y)
# Last expression of the cell: the estimator's score on the training data.
lm.score(X, Y)

In [None]:
# In-sample predictions from the fitted sklearn model.
predictions = lm.predict(X)
# Fitted parameters first, then R^2 and mean squared error, one value per line.
print(lm.intercept_, lm.coef_, sep='\n')
print(r2_score(Y, predictions), mean_squared_error(Y, predictions), sep='\n')


## Logistic Regression

In [None]:
# Read the churn CSV used for the logistic-regression example.
churn_csv = '../../04_bi-tableau/04_lab-bi-analysis-with-tableau/data/churn.csv'
churn = pd.read_csv(churn_csv)

In [None]:
# Preview the first rows of the churn table.
churn.head()

In [None]:
# List the dtype of every column.
churn.dtypes

In [None]:
# Display the Churn column, which the next cell encodes as the binary target.
churn['Churn']

In [None]:
# Binary target: 1 where Churn == 'Yes', 0 otherwise. Building it as a Series
# from `churn` keeps the original index (so it stays aligned with X) and the
# column name, instead of an anonymous DataFrame column called 0.
Y = (churn['Churn'] == 'Yes').astype(int)
# Single predictor plus a constant column added by add_constant.
X = churn[['MonthlyCharges']]
X = sm.add_constant(X)
model = sm.Logit(Y, X).fit()
# Last expression of the cell: shows the fitted Logit summary.
model.summary()