In [1]:
# pearson, spearman, t-test, chisquare, regression

import numpy as np
from scipy.stats import pearsonr, spearmanr, chisquare, ttest_ind

from sklearn.linear_model import LinearRegression

import pandas as pd
from IPython.display import Markdown, display

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

In [2]:
ALPHA = 0.05
DEFAULT_COLOR = 'lightseagreen'

In [3]:
def table(data, cols):
    
    def bounded_join(inp, sep):
        return sep + sep.join(inp) + sep
    
    vsep, hsep = "|", "-"
    
    head = "{}\n{}".format(bounded_join(cols, vsep),
                           bounded_join([hsep] * len(cols), vsep))
    markdown = head
    
    for row in data:
        srow = list(map(str, row))
        markdown += "\n" + bounded_join(srow, vsep)
    
    display(Markdown(markdown))

In [4]:
df = pd.read_csv("day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


### Correlations

In [5]:
def corr_table(pairs):
    table = []
    for p in pairs:
        col_one, col_two = df[p[0]], df[p[1]]
        pearson = pearsonr(col_one, col_two)[0]
        spearman = spearmanr(col_one, col_two).correlation
        table.append(["{} / {}".format(p[0], p[1]), 
                  "{:.3f}".format(pearson), 
                  "{:.3f}".format(spearman)])
    return table

In [6]:
cols = ["variables", "pearson", "spearman"]

pairs = [["temp", "cnt"], ["windspeed", "cnt"], ["hum", "cnt"]]

table(corr_table(pairs), cols)

|variables|pearson|spearman|
|-|-|-|
|temp / cnt|0.627|0.622|
|windspeed / cnt|-0.235|-0.217|
|hum / cnt|-0.101|-0.098|

In [7]:
cols = ["variables", "pearson", "spearman"]

pairs = [["temp", "casual"], ["windspeed", "casual"], ["hum", "casual"]]

table(corr_table(pairs), cols)

|variables|pearson|spearman|
|-|-|-|
|temp / casual|0.543|0.667|
|windspeed / casual|-0.168|-0.180|
|hum / casual|-0.077|-0.071|

In [8]:
# correlation between binary and continuos variables

cols = ["variables", "pearson", "spearman"]

pairs = [["holiday", "cnt"], ["weathersit", "cnt"], ["yr", "cnt"]]

table(corr_table(pairs), cols)

|variables|pearson|spearman|
|-|-|-|
|holiday / cnt|-0.068|-0.064|
|weathersit / cnt|-0.297|-0.272|
|yr / cnt|0.567|0.572|

### T-test

In [9]:
def plotly_boxplot(input_data):
    visual_data = []
    for y in input_data:
        visual_data.append(go.Box(y=input_data[y], name=y, 
                                  marker=dict(color=DEFAULT_COLOR)))
    
    MARGIN = dict(l=50, r=30, b=20, t=30)
    PLOT_HEIGHT = 250
    PLOT_WIDTH = 300
    layout = go.Layout(height=PLOT_HEIGHT, width=PLOT_WIDTH, margin=MARGIN, 
                       showlegend=False)

    fig = go.Figure(data=visual_data, layout=layout)
    py.iplot(fig, show_link=False)

In [10]:
workday_cnt = df[df['workingday'] == 1]['cnt']
weekend_cnt = df[df['workingday'] == 0]['cnt']

first_year_cnt = df[df['yr'] == 0]['cnt']
second_year_cnt = df[df['yr'] == 1]['cnt']

In [11]:
plotly_boxplot({"workday" : workday_cnt, "weekend" : weekend_cnt})

In [12]:
plotly_boxplot({"1st year" : first_year_cnt, "2nd year" : second_year_cnt})

In [13]:
# H0: samples have equal means

In [14]:
# just example
# for t-test data must have normal distribution
# else non-parametric Mann-Whitney rank test recommended

t_weekdays = ttest_ind(workday_cnt, weekend_cnt)

t_years = ttest_ind(first_year_cnt, second_year_cnt)

cols = ["variables", "p-value", "H0 accepted"]

t_table = [["workday / weekend", "{:.2e}".format(t_weekdays.pvalue), 
            t_weekdays.pvalue > ALPHA], 
           ["1st year / 2nd year", "{:.2e}".format(t_years.pvalue), 
            t_years.pvalue > ALPHA]]

table(t_table, cols)

|variables|p-value|H0 accepted|
|-|-|-|
|workday / weekend|9.85e-02|True|
|1st year / 2nd year|2.48e-63|False|

### Linear Regression

In [15]:
def scatter(x, y, color, mode): 
    return go.Scatter(x=x, y=y,
                      mode=mode,
                      marker=dict(opacity=0.5, color=color))

In [16]:
x = np.array(df['temp']).reshape(-1, 1)
y = np.array(df['cnt']).reshape(-1, 1)
lr = LinearRegression().fit(x, y);

In [17]:
n = len(y)
lr_values = lr.predict(x).reshape(-1)
exp_values = np.array(y).reshape(-1)

In [18]:
print("a = {:.2f}".format(lr.coef_[0][0]))
print("b = {:.2f}".format(lr.intercept_ [0]))
print("r^2 = {:.2f}".format(np.sqrt(sum((lr_values-exp_values)**2)/(n))))

a = 6640.71
b = 1214.64
r^2 = 1507.32


In [19]:
x1, y1 = df['temp'], df['cnt']
x2, y2  = x.reshape(-1), lr.predict(x).reshape(1,-1)[0].reshape(-1)

In [20]:
points = scatter(x1, y1, DEFAULT_COLOR, 'markers')
line = scatter(x2, y2, 'black', 'lines')

MARGIN = dict(l=70, r=30, b=40, t=30)
PLOT_HEIGHT = 350
PLOT_WIDTH = 500

layout = go.Layout(height=PLOT_HEIGHT, width=PLOT_WIDTH, margin=MARGIN,
                   hovermode='closest', showlegend=False,
                  xaxis=dict(title='temp', titlefont=dict(size=15)), 
                   yaxis=dict(title='cnt', titlefont=dict(size=15)))

fig = go.Figure(data=[points, line], layout=layout)
py.iplot(fig, show_link=False)

In [21]:
def plotly_barchart(x, y):
    visual_data = [go.Bar(x=x, y=y, marker=dict(color=DEFAULT_COLOR))]

    margin = dict(l=40, r=30, b=30, t=30)
    layout = go.Layout(height=250, width=400, margin=margin)

    fig = go.Figure(data=visual_data, layout=layout)
    py.iplot(fig, show_link=False)

### Chi-square

In [22]:
def bar(x, y, color, name=None):
    return go.Bar(x=x, y=y, marker=dict(opacity=0.8, color=color), name=name)


def scatter(x, y, color, mode, symbol=None, marker_size=None, name=None): 
    return go.Scatter(x=x, y=y,
                      mode=mode, name=name,
                      line=dict(dash='dot'),
                      marker=dict(color=color, symbol=symbol, 
                                  size=marker_size))

In [23]:
def histogram(s, bins):
    hist, edges = np.histogram(s, bins=bins)
    centers = (edges[1:] + edges[:-1]) / 2
    return hist, centers


def normalize(s):
    return (s - np.mean(s)) / np.std(s)


def norm_distr(x):
    return np.exp((-x ** 2) / 2) / np.sqrt(2 * np.pi)

In [24]:
hum = df['hum']
hist, centers = histogram(normalize(hum), bins=np.linspace(-2, 2, 5))
x_interp = np.linspace(-2, 2)

In [25]:
hist_sum = sum(hist)
theory_hist =  hist_sum * norm_distr(centers)
theory_distr = hist_sum * norm_distr(-x_interp)

In [26]:
data = [bar(centers, hist, DEFAULT_COLOR, name="hist"),
    
        scatter(centers, theory_hist, 
                color='black', mode='markers', 
                symbol="x", marker_size=10, name="theory hist"),
    
        scatter(x_interp, theory_distr, 
                color='black', mode='lines', name="theory")]

MARGIN = dict(l=40, r=30, b=30, t=30)
PLOT_HEIGHT = 250
PLOT_WIDTH = 400

layout = go.Layout(height=PLOT_HEIGHT, width=PLOT_WIDTH, margin=MARGIN)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, show_link=False)

In [27]:
chi_data = []

for j in [4, 5, 6, 7]:
    hist, centers = histogram(normalize(hum), bins=np.linspace(-2, 2, j))

    pvalue = chisquare(hist, sum(hist) * norm_distr(centers)).pvalue
    
    chi_data.append([j, pvalue])

In [28]:
# H0: samples have equal distributions

In [29]:
chi_print = [[row[0], "{:.2e}".format(row[1]), row[1] > ALPHA] for row in chi_data]

table(chi_print, ["n bins","p-value", "H0 accepted"])

|n bins|p-value|H0 accepted|
|-|-|-|
|4|8.38e-21|False|
|5|3.20e-01|True|
|6|1.40e-06|False|
|7|5.04e-20|False|