In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from dateutil.parser import parse
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv('C:/Users/12077/Desktop/renjie/job/SE/files.csv',usecols = [1,2,3,4,5,6])

In [3]:
df.head(5)

Unnamed: 0,number,available_bikes,available_bike_stands,weather,temperature,time
0,2,2,18,light rain,6.13,2019-04-15_12:25:17
1,3,3,17,light rain,6.13,2019-04-15_12:25:17
2,4,19,1,light rain,6.13,2019-04-15_12:25:17
3,5,5,35,light rain,6.13,2019-04-15_12:25:17
4,6,1,19,light rain,6.13,2019-04-15_12:25:17


In [4]:
df['time'] = pd.to_datetime(df['time'],format="%Y-%m-%d_%H:%M:%S")
df['weekday'] = [i.weekday()+1 for i in df['time']]
df['hour']  = [i.hour for i in df['time']]
df = df.set_index('time',drop = False).groupby('number').resample('H').first().ffill().reset_index(drop=True).set_index('time',drop = True)
df


Unnamed: 0_level_0,number,available_bikes,available_bike_stands,weather,temperature,weekday,hour
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-03-15 11:06:41,2,0,20,light intensity shower rain,9.92,5,11
2019-03-15 12:05:24,2,0,20,broken clouds,9.75,5,12
2019-03-15 13:06:31,2,1,19,broken clouds,10.28,5,13
2019-03-15 14:05:59,2,0,20,shower rain,9.88,5,14
2019-03-15 15:05:24,2,0,20,shower rain,10.04,5,15
2019-03-15 16:04:51,2,1,19,light intensity shower rain,9.35,5,16
2019-03-15 17:04:40,2,1,19,light intensity shower rain,8.61,5,17
2019-03-15 18:03:55,2,4,16,broken clouds,8.38,5,18
2019-03-15 19:03:08,2,12,8,light intensity shower rain,7.48,5,19
2019-03-15 20:02:33,2,13,7,light rain,6.82,5,20


In [5]:
df.dtypes

number                     int64
available_bikes            int64
available_bike_stands      int64
weather                   object
temperature              float64
weekday                    int64
hour                       int64
dtype: object

In [6]:
np.unique(df['weather'].values)

array(['broken clouds', 'clear sky', 'drizzle', 'few clouds', 'fog',
       'heavy intensity rain', 'light intensity drizzle',
       'light intensity drizzle rain', 'light intensity shower rain',
       'light rain', 'mist', 'moderate rain', 'overcast clouds',
       'scattered clouds', 'shower rain'], dtype=object)

In [7]:
df[['weather','hour','weekday','number']]=df[['weather','hour','weekday','number']].astype('category')
df.dtypes

number                   category
available_bikes             int64
available_bike_stands       int64
weather                  category
temperature               float64
weekday                  category
hour                     category
dtype: object

In [8]:

def split_set_by_hand(df):
    '''this function is designed to split the train/test set by hand'''
    test = df.truncate(after = '2019-04-08')
    train = df.truncate(before = '2019-04-08')
    y_train = train['available_bike_stands'].reset_index(drop=True)
    X_train = train[['number','weather','hour','temperature','weekday']].reset_index(drop=True)
    y_test = test['available_bike_stands'].reset_index(drop=True)
    X_test = test[['number','weather','hour','temperature','weekday']].reset_index(drop=True)
#     y_train = train['available_bike_stands']
#     X_train = train[['number','weather','hour','temperature','weekday']]
#     y_test = test['available_bike_stands']
#     X_test = test[['number','weather','hour','temperature','weekday']]
    return (X_train,X_test,y_train,y_test)

In [9]:
#aquire a special set with a fixed station and fixed weekday
df1 = df[(df['number']== 50) & (df['weekday'] == 1)]
df1

Unnamed: 0_level_0,number,available_bikes,available_bike_stands,weather,temperature,weekday,hour
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-03-18 00:05:35,50,8,32,broken clouds,4.48,1,0
2019-03-18 01:03:51,50,10,30,broken clouds,4.49,1,1
2019-03-18 02:01:37,50,10,30,broken clouds,4.91,1,2
2019-03-18 03:05:53,50,10,30,broken clouds,4.91,1,3
2019-03-18 04:03:48,50,10,30,light rain,5.00,1,4
2019-03-18 05:01:40,50,10,30,light rain,5.00,1,5
2019-03-18 06:06:25,50,6,34,light rain,5.00,1,6
2019-03-18 07:04:08,50,1,39,light rain,5.00,1,7
2019-03-18 08:02:30,50,1,39,light rain,5.21,1,8
2019-03-18 09:01:34,50,0,40,broken clouds,6.20,1,9


In [10]:
# split the special set into train/test set
X_train, X_test, y_train, y_test = split_set_by_hand(df1)
print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))
dicv = DictVectorizer(sparse = False)
X_train = dicv.fit_transform(X_train.to_dict(orient="records"))
X_test = dicv.transform(X_test.to_dict(orient="records"))
dicv.get_feature_names()

Training data:
    number                       weather hour  temperature weekday  \
0      50                 broken clouds    0         6.00       1   
1      50                 broken clouds    1         6.01       1   
2      50                 broken clouds    2         6.38       1   
3      50                 broken clouds    3         6.75       1   
4      50                 broken clouds    4         7.19       1   
5      50                          mist    5         7.38       1   
6      50                          mist    6         7.53       1   
7      50                          mist    7         7.98       1   
8      50       light intensity drizzle    8         8.62       1   
9      50                          mist    9         8.83       1   
10     50                          mist   10         8.82       1   
11     50                    light rain   11         8.93       1   
12     50                 moderate rain   12         8.09       1   
13     50         

['hour',
 'number',
 'temperature',
 'weather=broken clouds',
 'weather=light intensity drizzle',
 'weather=light intensity drizzle rain',
 'weather=light rain',
 'weather=mist',
 'weather=moderate rain',
 'weekday']

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

def predict_select( X_train, X_test, y_train, y_test):
    dicv = DictVectorizer(sparse = False)
#     pca = PCA(n_components=0.9)
    X_train = dicv.fit_transform(X_train.to_dict(orient="records"))
    X_test = dicv.transform(X_test.to_dict(orient="records"))
#     X_train = pca.fit_transform(X_train)
#     X_test = pca.fit_transform(X_test)
    
    dtr = tree.DecisionTreeRegressor()
    gc1 = GridSearchCV(dtr, param_grid={'max_depth':[5,15,25,35,45,55]}, cv=8)
    gc1.fit(X_train, y_train)
    print('the decision tree result:')
    print("the score of predicting the test sets is :", gc1.score(X_test, y_test))
    print("Mean cross-validated score is : ", gc1.best_score_)
    print("the best params is :", gc1.best_params_)
    print("the max of the mean_test_score is :",gc1.cv_results_['mean_test_score'].max())
    print()

    nbrs = KNeighborsClassifier()
    gc2 = GridSearchCV(nbrs, param_grid={'n_neighbors':[2,3,4,5,6]}, cv=8)
    gc2.fit(X_train, y_train)
    print('the K neighbors result:')
    print("the score of predicting the test sets is :", gc2.score(X_test, y_test))
    print("Mean cross-validated score is :", gc2.best_score_)
    print("the best params is :", gc2.best_params_)
    print("the max of the mean_test_score is :",gc2.cv_results_['mean_test_score'].max())
    print()

    rfc = RandomForestClassifier()
    gc3 = GridSearchCV(rfc, param_grid={"n_estimators": [25,100], "max_depth": [25,35]}, cv=8)
    gc3.fit(X_train, y_train)
    print('the random forest result:')
    print("the score of predicting the test sets is :", gc3.score(X_test, y_test))
    print("Mean cross-validated score is :", gc3.best_score_)
    print("the best params is :", gc3.best_params_)
    print("the max of the mean_test_score is :",gc3.cv_results_['mean_test_score'].max())
    print()

    mlt = MultinomialNB()
    gc4 = GridSearchCV(mlt, param_grid={"alpha": [0.01,0.05,0.1,0.5,1]}, cv=8)
    gc4.fit(X_train, y_train)
    print('the naive beyas result:')
    print("the score of predicting the test sets is :", gc4.score(X_test, y_test))
    print("Mean cross-validated score is :", gc4.best_score_)
    print("the best params is :", gc4.best_params_)
    print("the max of the mean_test_score is :",gc4.cv_results_['mean_test_score'].max())  
    print()
    
    max_score_index = gc1
    for i in [gc2,gc3,gc4]:
        if i.cv_results_['mean_test_score'].max()>max_score_index.cv_results_['mean_test_score'].max():
            max_score_index = i
    print("the max mean test score result is :",max_score_index.cv_results_['mean_test_score'].max())
    print("the estimator is :",max_score_index.best_estimator_)  

    return max_score_index.best_params_

In [12]:
df['temperature']=abs(df['temperature'])
df.reset_index(drop = True,inplace= True)
X_train, X_test, y_train, y_test = split_set_by_hand(df)

In [13]:
#train/test set is not random
predict_select(X_train, X_test, y_train, y_test)

the decision tree result:
the score of predicting the test sets is : 0.007833465843840481
Mean cross-validated score is :  -0.14702063105787558
the best params is : {'max_depth': 5}
the max of the mean_test_score is : -0.14702063105787558

the K neighbors result:
the score of predicting the test sets is : 0.05928677563150074
Mean cross-validated score is : 0.03281961689648718
the best params is : {'n_neighbors': 6}
the max of the mean_test_score is : 0.03281961689648718



KeyboardInterrupt: 

In [None]:
# train/test set is random
y = df['available_bike_stands']
X = df[['number','weather','hour','temperature','weekday']]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)
predict_select(X_train, X_test, y_train, y_test)

As shown above, it is clear to see that the DecisionTreeRegressor estimator has the highest accuracy.so we choose DecisionTreeRegressor as our prediction function, Meanwhile the best parameter(max_depth) is 15

In [None]:
import joblib
def predict_model(features,target):
    dicv = DictVectorizer(sparse = False)
    X = dicv.fit_transform(features.to_dict(orient="records"))
    y = target
    dtr = tree.DecisionTreeRegressor(max_depth = 15)
    dtr.fit(X,y)
    joblib.dump(dtr,'dbbikes_model.pkl')

In [None]:
df = df.reset_index(drop=True)
y = df['available_bike_stands']
X = df[['number','weather','hour','temperature','weekday']]
predict_model(X,y)

In [None]:
import time
import datetime
import os
def TimeStampToTime(timestamp):
    timeStruct = time.localtime(timestamp)
    return time.strftime('%Y-%m-%d',timeStruct)

def get_FileModifyTime(filePath):
    t = os.path.getmtime(filePath)
    return TimeStampToTime(t)

# aquire the 'dbbikes_model.pkl' last modified time and the current time,if they are the same day, system will not train the data again 
file_modify_time =  get_FileModifyTime('dbbikes_model.pkl')
now = TimeStampToTime(time.time())

if now == file_modify_time:
    model = joblib.load("dbbikes_model.pkl")
else:
    df = df.reset_index(drop=True)
    y = df['available_bike_stands']
    X = df[['number','weather','hour','temperature','weekday']]
    predict_model(X,y)
    model = joblib.load("dbbikes_model.pkl")
    

In [None]:
def create_predict_set(station,weekday):
    dict = {}
    dict['hour'] = [i for i in range(24)]
    dict['weekday'] = [weekday for i in range(24)]
    dict['number'] = [station for i in range(24)]
    dict['weather'] = ['broken clouds', 'clear sky', 'drizzle', 'few clouds','broken clouds','broken clouds', 'clear sky', 'drizzle', 'few clouds','broken clouds', 'clear sky', 'drizzle', 'few clouds', 'fog',
       'heavy intensity rain', 'light intensity drizzle',
       'light intensity drizzle rain', 'light intensity shower rain',
       'light rain', 'mist', 'moderate rain', 'overcast clouds',
       'scattered clouds', 'shower rain'] 
    dict['temperature'] =[6.00 for i in range(24)]
    df = pd.DataFrame(dict)
    df[['weather','hour','weekday','number']]=df[['weather','hour','weekday','number']].astype('category')
    dicv = DictVectorizer(sparse = False)
    df = dicv.fit_transform(df.to_dict(orient="records"))    
    return df
df2 =create_predict_set(2,5)
model.predict(df2)