In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline

In [None]:
citibikes_2018_spring_file = "JC-201804-citibike-tripdata.csv"
citibikes_2018_summer_file = "JC-201808-citibike-tripdata.csv"
citibikes_2018_winter_file = "JC-201812-citibike-tripdata.csv"
citibikes_2019_spring_file = "JC-201903-citibike-tripdata.csv"

citibikes_2018_spring_df = pd.read_csv(citibikes_2018_spring_file)
citibikes_2018_summer_df = pd.read_csv(citibikes_2018_summer_file)
citibikes_2018_winter_df = pd.read_csv(citibikes_2018_winter_file)
citibikes_2019_spring_df = pd.read_csv(citibikes_2019_spring_file)

citibikes_2018_spring_summer_df = citibikes_2018_spring_df.append(citibikes_2018_summer_df)
citibikes_2018_spring_summer_winter_df = citibikes_2018_spring_summer_df.append(citibikes_2018_winter_df)
citibikes_2018_2019_spring_df = citibikes_2018_spring_df.append(citibikes_2019_spring_df)
bikes_info_df = citibikes_2018_2019_spring_df.append(citibikes_2018_spring_summer_winter_df)
#citibikes_2018_2019_spring_df.head()
#citibikes_2018_spring_summer_winter_df.head()
#bikes_info_df
bikes_info = bikes_info_df.drop(columns= ["start station id", 'start station name', 'end station id', 'end station name', 'bikeid'])
#bikes_info.head()
bikes_info_clean = bikes_info.dropna(axis=0)
#bikes_info_clean.head()
bikes_info_clean.to_csv('bikes_info_clean.csv')


In [None]:
bikes_info_clean.info

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'tripduration', y = 'birth year', data = bikes_info_clean)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'tripduration', y = 'gender', data = bikes_info_clean)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'tripduration', y = 'usertype', data = bikes_info_clean)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.scatterplot(x = 'tripduration', y = 'starttime', data = bikes_info_clean)

In [None]:
fig = plt.figure(figsize = (10,6))
sns.scatterplot(x = 'gender', y = 'starttime', data = bikes_info_clean)

In [7]:
#Making binary classificaion for the response variable.
#Dividing birth year as under20,30,40,....,80, over 80 and bad by giving the limit for the quality
bins = (20, 30, 40, 50, 60, 70, 80, 150)
group_names = ['19 or younger', '20s', '30s','40s', '50s','60s', '70s']
bikes_info_clean['birth year'] = pd.cut(bikes_info_clean['birth year'], bins = bins, labels = group_names)

In [8]:
#assign a labels to our quality variable
label_quality = LabelEncoder()

In [9]:
#Bad becomes 0 and good becomes 1 
bikes_info_clean['birth year'] = label_quality.fit_transform(bikes_info_clean['birth year'])

In [10]:
bikes_info_clean['birth year'].value_counts()

2047      1
62772     1
25894     1
27943     1
5416      1
7465      1
1322      1
3371      1
13612     1
15661     1
9518      1
11567     1
54576     1
56625     1
50482     1
52531     1
64821     1
85507     1
58678     1
60727     1
38200     1
40249     1
34106     1
36155     1
46396     1
48445     1
42302     1
44351     1
132458    1
134507    1
         ..
51916     1
49869     1
56014     1
53967     1
10960     1
8913      1
15058     1
13011     1
2772      1
725       1
6870      1
4823      1
27352     1
25305     1
31450     1
29403     1
19164     1
17117     1
23262     1
21215     1
109280    1
107233    1
113378    1
111331    1
101092    1
99045     1
105190    1
103143    1
125672    1
0         1
Name: birth year, Length: 135511, dtype: int64

In [None]:
sns.countplot(bikes_info_clean['birth year'])

In [None]:
#Now seperate the dataset as response variable and feature variabes
X = bikes_info_clean.drop('birth year', axis = 1)
y = bikes_info_clean['birth year']

In [None]:
#Train and Test splitting of data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()

In [None]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

Random Forest Classifier

In [None]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [None]:
#see how our model performed
print(classification_report(y_test, pred_rfc))

In [None]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_rfc))

In [None]:
sgd = SGDClassifier(penalty=None)
sgd.fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)

In [None]:
print(classification_report(y_test, pred_sgd))

In [None]:
print(confusion_matrix(y_test, pred_sgd))

Support Vector Classifier


In [None]:
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [None]:
print(classification_report(y_test, pred_svc))

to increase our accuracy of models
Grid Search CV

In [None]:
#Finding best parameters for our SVC model =========> NEEDS TWEEKING
param = {
    'C': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
}
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
grid_svc.fit(X_train, y_train)

In [None]:
#Best parameters for our svc model
grid_svc.best_params_

In [None]:
#run  SVC again with the best parameters.
svc2 = SVC(C = 1.2, gamma =  0.9, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

Cross Validation Score for random forest and SGD

In [None]:
#Evaluation for random forest model using cross validation.
rfc_eval = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
rfc_eval.mean()