In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [2]:
# Never truncate cell contents when displaying frames, so the complete tweet
# text stays visible. None is the supported "no limit" value; the old -1
# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [3]:
# Exact layout of the `created_at` strings in the CSV.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def datetime_parser(x):
    """Parse `x` (a string or a Series of strings) using DATETIME_FORMAT.

    Uses pd.to_datetime rather than the `pd.datetime` alias, which was
    removed in pandas 2.0.
    """
    return pd.to_datetime(x, format=DATETIME_FORMAT)


# Load the tweets, indexed by tweet id (first column).
# NOTE(review): with `names=` supplied, header=1 makes pandas skip line 0 and
# treat line 1 as the header row being replaced, so data starts at line 2.
# If the file has a single header line this drops the first tweet; confirm
# whether header=0 was intended.
data = pd.read_csv(
    'dog_rates_tweets.csv',
    index_col=0,
    header=1,
    names=['id', 'created_at', 'text'],
)

# Convert the dates after loading instead of via read_csv's `date_parser`
# argument, which is deprecated since pandas 2.0.
data['created_at'] = datetime_parser(data['created_at'])

In [4]:
# Capture the numerator of an "<number>/10" score in the tweet text; the
# number may carry a decimal part. Tweets without a match become NaN.
rating_pattern = r'(\d+(?:\.\d+)?)/10'
extracted = data['text'].str.extract(rating_pattern, expand=True)
data['rating'] = extracted.astype(float)

In [5]:
# Ratings above this value are treated as outliers and discarded.
MAX_RATING = 20

# Keep only tweets that carry a rating no greater than MAX_RATING.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
has_rating = data['rating'].notnull()
data = data[has_rating & (data['rating'] <= MAX_RATING)].copy()

In [6]:
def to_timestamp(input):
    """Return the value of ``input.timestamp()``.

    `input` is any object exposing a ``timestamp()`` method (for example a
    datetime); the result of that call is passed through unchanged.
    """
    seconds = input.timestamp()
    return seconds

# Numeric version of the tweet date, one value per row, so the dates can be
# fed to the regression as a plain number.
created_at = data['created_at']
data['timestamp'] = created_at.map(to_timestamp)

In [7]:
# Least-squares line of rating against the numeric tweet timestamp.
fit = stats.linregress(data['timestamp'], data['rating'])
slope, intercept = fit.slope, fit.intercept
r_value, p_value, std_err = fit.rvalue, fit.pvalue, fit.stderr

In [8]:
# Rating predicted by the fitted line for every tweet: y = m*x + b.
predicted = data['timestamp'] * slope + intercept

In [17]:
# Residual = observed rating minus the rating predicted by the fitted line.
# The operands were previously reversed (predicted - observed), which flips
# the sign of every residual and mirrors the histogram around zero.
residual = np.subtract(data['rating'], predicted)

In [23]:
# Report the fitted line's parameters, one per line.
print(slope, intercept, sep='\n')

# From here on pandas renders floats with two decimals, so the timestamps
# print as plain numbers.
pd.set_option('display.float_format', '{:.2f}'.format)
print(data['timestamp'])

5.80460992352e-08
-74.1978549816
id
905222050297643008   1504681929.00
905175402502660096   1504670807.00
905098956430086144   1504652581.00
905079268476145665   1504647887.00
904495094014861312   1504508609.00
904363433650515968   1504477219.00
904128876116410369   1504421296.00
903288181222772736   1504220859.00
903047250515025922   1504163416.00
902923889638072321   1504134005.00
902561653547552769   1504047641.00
902322102602227712   1503990528.00
902236863959900163   1503970205.00
901564209359704068   1503809832.00
901527884283752448   1503801171.00
901505596159119364   1503795857.00
901484571857911808   1503790845.00
901234726886932482   1503731277.00
901204319462187008   1503724027.00
901096853307097089   1503698406.00
900881534437265408   1503647070.00
900742018732871681   1503613806.00
900502499592990721   1503556701.00
900391075784916992   1503530135.00
900149487678750720   1503472536.00
900070456102932480   1503453693.00
900025498180493312   1503442975.00
899783474726535170 

In [10]:
print("P Value = ", p_value)

# Figure 1: individual ratings over time with the fitted regression line.
# The line reuses `predicted` (slope * timestamp + intercept) instead of
# recomputing it, and the unused `x = plt.plot(...)` binding is gone.
fig_fit, ax_fit = plt.subplots(num=1)
ax_fit.plot(data['created_at'], data['rating'], 'b.', alpha=0.5, label='Ratings')
ax_fit.plot(data['created_at'], predicted, 'r-', linewidth=3, label='Line of Best Fit')
ax_fit.tick_params(axis='x', labelrotation=25)
ax_fit.legend(loc='lower right')
ax_fit.set_xlabel('Date of Tweet')
ax_fit.set_ylabel('Rating')
fig_fit.savefig('dog_ratings_1')

# Figure 2: distribution of the residuals around the fitted line.
fig_resid, ax_resid = plt.subplots(num=2)
ax_resid.hist(residual, bins='auto')
ax_resid.set_xlabel('Distance from Line of Best Fit')
ax_resid.set_ylabel('Number of Points')
fig_resid.savefig('dog_ratings_2')

P Value =  7.79578475263e-77


In [11]:
# Release every open figure. A bare plt.close() only closes the *current*
# figure, which left figure 1 open; clearing the figure and axes right
# before closing was redundant.
plt.close('all')