In [None]:
%autosave 60

# NYC dataset

# Basics

## Import Libraries

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date
from pandas_profiling import ProfileReport

## For geomap purpose
import geopandas
import folium

## For checking missing values
import missingno as msno

## For test/train splitting
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
#! python3 -m pip install --user bokeh==2.3.3

## upload dataset

In [None]:
# Resolve the dataset directory from the environment. Fail with an explicit message
# when the variable is unset instead of the opaque "NoneType + str" TypeError.
_dataset_root = os.getenv("DirForLocalDataset")
if _dataset_root is None:
    raise EnvironmentError("Environment variable 'DirForLocalDataset' is not set")
path_dataset = os.path.join(_dataset_root, 'axa_dataset')

In [None]:
# Load the bike-trip CSV: the second entry returned by os.listdir() for the dataset folder.
# NOTE(review): os.listdir() returns entries in arbitrary order, so index [1] is not
# guaranteed to select the same file on every machine — consider selecting the file by name.
df_bike = pd.read_csv(os.path.join(path_dataset, os.listdir(path_dataset)[1]))
df_bike.head()

# insight into data

In [None]:
print(f"Shape of dataset is {df_bike.shape}")
print(df_bike.dtypes)

In [None]:
df_bike.describe()

## Profiling

In [None]:
# Build the profiling report object for the bike data.
# The display line below is commented out, so this cell renders nothing by itself.
profile = ProfileReport(df_bike, title="Pandas Profiling Report")
#profile

## statistics

In [None]:
def draw_stats(df, n_rows=10, n_cols=2):
    """Plot a linear and a log-scale histogram for every numeric column of ``df``.

    Each numeric column takes two consecutive cells of the subplot grid: the
    first shows the plain histogram, the second the same histogram with a
    logarithmic y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are plotted; non-numeric columns are skipped.
    n_rows : int, default 10
        Minimum number of grid rows. The grid grows automatically when the
        numeric columns would not fit.
    n_cols : int, default 2
        Number of grid columns.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # select_dtypes("number") also picks up int32/float32 etc., not only int64/float64.
    numeric_cols = list(df.select_dtypes(include="number").columns)
    n_plots = 2 * len(numeric_cols)

    # Grow the grid when needed, so extra columns do not raise "num must be <= rows*cols".
    rows_needed = -(-n_plots // n_cols)  # ceiling division
    grid_rows = max(n_rows, rows_needed, 1)
    # Keep the original 20x25 canvas; stretch the height only when rows were added.
    fig_height = 25 if grid_rows == n_rows else 25 * grid_rows / max(n_rows, 1)

    fig, axes = plt.subplots(nrows=grid_rows, ncols=n_cols,
                             figsize=(20, fig_height), squeeze=False)
    flat_axes = axes.ravel()

    for k, col in enumerate(numeric_cols):
        linear_ax, log_ax = flat_axes[2 * k], flat_axes[2 * k + 1]

        df[col].hist(bins=30, ax=linear_ax)
        linear_ax.set_title(col)

        df[col].hist(bins=30, ax=log_ax)
        log_ax.set_title(col+" (log scale)")
        log_ax.set_yscale('log')

    # Hide grid cells that received no histogram.
    for unused_ax in flat_axes[n_plots:]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
draw_stats(df_bike)

## Check correlation

In [None]:
# Correlate numeric columns only: the frame also holds string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently skipping them.
corr = df_bike.select_dtypes(include="number").corr()
sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns)

# Preprocessing

## Check null

In [None]:
df_bike.isnull().sum() * 100 / len(df_bike)

## How to deal with missing values?

In [None]:
msno.matrix(df_bike)

In [None]:
msno.heatmap(df_bike)

In [None]:
df_bike=df_bike.dropna(subset=['end station name']) #3% of data

In [None]:
df_bike.isnull().sum() * 100 / len(df_bike)

In [None]:
# Boolean flag: True where the rider's birth year is missing, False otherwise.
df_bike['dob_missing'] = df_bike['birth year'].isnull()
df_bike.head(2)

In [None]:
# Number of rows with a missing birth year, per gender.
df_bike[df_bike["dob_missing"]].groupby("gender")["dob_missing"].count()

In [None]:
# Number of rows with a missing birth year, per user type.
df_bike[df_bike["dob_missing"]].groupby("usertype")["dob_missing"].count()

# Feature Engineering

## Time

### Convert timestamp to datetime variable

In [None]:
df_bike["starttime"] = pd.to_datetime(df_bike['starttime'], format='%Y-%m-%d %H:%M:%S') 
df_bike["stoptime"]  = pd.to_datetime(df_bike['stoptime'], format='%Y-%m-%d %H:%M:%S')

### Calculate month/time/weekday from timestamp

In [None]:
df_bike['month']= df_bike['starttime'].dt.month_name()
df_bike['time'] =  df_bike['starttime'].dt.time
df_bike['weekday'] = df_bike['starttime'].dt.day_name()

In [None]:
from pandas.api.types import CategoricalDtype
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

cat_type = CategoricalDtype(categories=cats, ordered=True)
df_bike['weekday'] = df_bike['weekday'].astype(cat_type)

In [None]:
df_bike.head(2)

### Which year/month the dataset belongs to?

In [None]:
df_bike[df_bike['month']=="July"]['month'].count()

In [None]:
# Pie chart of the number of trips started in June vs. July.
labels = 'June', 'July'
_trips_per_month = df_bike['month'].value_counts()
sizes = [_trips_per_month.get(month_name, 0) for month_name in labels]
explode = (0, 0.1)  # pull the second slice ('July') slightly out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

In [None]:
# plt.pie needs numeric slice sizes; the unique month *names* cannot be plotted.
# Use the number of trips per month as sizes and the month names as labels.
month_counts = df_bike['month'].value_counts()
plt.pie(month_counts.values, labels=month_counts.index, autopct='%1.1f%%')

In [None]:
df_bike['month'].value_counts()

## Users

### Convert gender as string categorical

In [None]:
df_bike['gender'].unique()

In [None]:
genders = ['Unknown','Male','Female']
df_bike['gender']= df_bike['gender'].replace([0, 1, 2], genders)
df_bike["gender"] = df_bike["gender"].astype("category")
df_bike.head(2)

### Calculate age of the users

In [None]:
# Rows to leave without an age: birth years before 1946 and riders of unknown gender.
# Gender was recoded from 0/1/2 to 'Unknown'/'Male'/'Female' earlier in the notebook,
# so a comparison with 0 alone never matches; accept both encodings so that the cell
# also works when it is run before the recoding.
skip = (df_bike['birth year'] < 1946) | df_bike['gender'].isin([0, 'Unknown'])
# NOTE(review): age is relative to the day the notebook is run, not to the trip date.
df_bike['age'] = (date.today().year - df_bike['birth year']).mask(skip)

In [None]:
df_bike.head(2)

## Geo Info

### Calculate distance between start/end

In [None]:
def get_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
        Scalars and arrays may be mixed; normal NumPy broadcasting applies.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance along the sphere, in the unit of ``earth_radius``
        (6371, i.e. kilometres).
    """
    earth_radius = 6371
    # Convert each argument separately so that scalars broadcast against arrays.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    # Rounding error can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [None]:
df_bike['dist_km'] = get_distance(df_bike["start station latitude"], df_bike["start station longitude"], 
                                    df_bike["end station latitude"], df_bike["end station longitude"])

## Trip Information

### Let's find trip duration in minutes/hours

In [None]:
# Collect all trips shorter than 1 hour
#duration_mins = df_bike.loc[(df_bike.tripduration / 60 < 60)][['tripduration']]
df_bike['duration_mins'] = df_bike['tripduration']/60
df_bike['duration_hr'] = df_bike['tripduration']/(60*60)

## Data Analysis

### How are users distributed across weekdays?

In [None]:
fig, ax = plt.subplots(figsize=(8,6))

df_bike[df_bike['usertype']=="Customer"].groupby("weekday")["usertype"].count().plot(ax=ax, label='Customer', marker='o',legend=True)
df_bike[df_bike['usertype']=="Subscriber"].groupby("weekday")["usertype"].count().plot(ax=ax, label='Subscriber', marker='o', legend=True)
plt.title("Distribution of Usertype during Weekday")
plt.xlabel("Weekday")
plt.ylabel('Count')
plt.savefig("usertype_during_weekday.png")

### What are our users?

In [None]:
def get_label_size_list(df, column):
    """Return the distinct values of ``df[column]`` and how often each occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column : str
        Name of the column to summarise.

    Returns
    -------
    (list, list)
        ``label_list`` holds the distinct values in order of first appearance;
        ``size_list`` holds the matching row counts. A missing value (NaN)
        appears as a label with count 0.
    """
    # Count every value in a single pass instead of re-filtering the frame per label.
    counts = df[column].value_counts().to_dict()

    label_list = list(df[column].unique())
    size_list = [int(counts.get(val, 0)) for val in label_list]

    return label_list, size_list

In [None]:
get_label_size_list(df_bike, 'gender')

In [None]:
# Pie chart of trips per gender
labels, sizes = get_label_size_list(df_bike, 'gender')
# Pull the second slice out of the pie; sized to the number of labels so the
# cell keeps working when the number of gender categories is not exactly three.
explode = [0.1 if position == 1 else 0 for position in range(len(labels))]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')

# Save before plt.show(): once the figure has been shown, plt.savefig would
# write a new, empty figure.
fig1.savefig('gender_pie_dist.png')
plt.show()

### Age boxplot

In [None]:
sns.set_theme(style="whitegrid")
sns.boxplot(x=df_bike["age"], width=0.5)
plt.savefig('age_boxplot.png')

###  Age vs Weekday Distribution

In [None]:
sns.set(rc={'figure.figsize':(8,5)})
sns.set_theme(style="whitegrid")
bplot=sns.boxplot(x="weekday", y="age", palette="colorblind", data=df_bike)
bplot.axes.set_title("Distribution of age during Weekday", fontsize=16)
bplot.set_ylabel("Age", fontsize=14)
bplot.set_xlabel("Weekday", fontsize=14)
plt.savefig('age_weekday_boxplot.png')

### How many unique bikes are available?

In [None]:
bplot=sns.histplot(df_bike["bikeid"].unique())
bplot.set_xlabel("Unique BikeId", fontsize=14)
plt.savefig("Unique_bikes.png")
print("number of unique bike-id is: ", df_bike["bikeid"].unique().shape)

### Distribution of age usage

In [None]:
sns.displot(data=df_bike, x='age', hue='gender', multiple='stack',  aspect=10/5)

### Which station had the oldest riders on average

In [None]:
# Mean of the numeric columns per start station, ordered by mean birth year
# (lowest mean birth year, i.e. oldest riders, first). numeric_only=True keeps the
# string/categorical/time columns out of the mean, which would raise on pandas >= 2.0.
df_bike.groupby(by=["start station id"]).mean(numeric_only=True).sort_values(by=['birth year'])

In [None]:
df_bike.groupby(["starttime", "usertype"]).count()

In [None]:
df_bike[df_bike['usertype']=="Customer"].groupby(["weekday",'tripduration']).count()

In [None]:
# Use the trip start time as the index. Guarded so that re-running the cell does
# not fail with a KeyError once 'starttime' has already been moved into the index.
if df_bike.index.name != 'starttime':
    df_bike = df_bike.set_index('starttime')

### Trip distance

In [None]:
# The distance column created above is named 'dist_km' (there is no "dist (km)" column).
sns.displot(data=df_bike, x="dist_km",  aspect=10/5)

### Distribution of trip duration

In [None]:
plt.rcParams.update({'font.size': 16})
plt.hist(df_bike['duration_mins'][df_bike['duration_mins']<60])
plt.tick_params(axis = 'both', which = 'major')
plt.title('Distribution of trip durations\n')
plt.xlabel('Duration (min.)')
plt.ylabel('Trip counts')

### Usertype vs time

In [None]:
fig, ax = plt.subplots(figsize=(15,7))
df_bike.groupby('usertype')['tripduration'].plot(legend=True, ax=ax)

### User/Gender Distribution

In [None]:
trip_counts = df_bike.groupby('gender')['gender'].count()
# Plot
plt.rcParams.update({'font.size': 12})
# Take the bar labels from the grouped index so every bar is paired with its own
# count (the previous `gender` name was never defined).
plt.barh(trip_counts.index.astype(str), trip_counts / 1000000, align = 'center', alpha = 0.5)
plt.xlabel('Millions of trips')
plt.title('Distribution of trips according to user gender')
plt.show()

### Gender vs usertype

In [None]:
df_bike.groupby(["usertype"])['gender'].unique()

### How are user types distributed over the month?

In [None]:
# The former bare `df_bike.reset_index()` discarded its result and had no effect;
# groupby resolves 'starttime' whether it is a column or the index name.
# NOTE(review): this groups by the exact start timestamp; consider aggregating per
# day if a daily trend per user type is what should be shown.

# plot data
fig, ax = plt.subplots(figsize=(15,7))
# Select the column before counting so only one column is aggregated.
df_bike.groupby(['starttime','usertype'])['tripduration'].count().unstack().plot(ax=ax)

## Feature Engineering

In [None]:
print(df_bike.dtypes)
df_bike.head()

### Drop unnecessary features

In [None]:
# Columns to remove; fill in as needed. The list starts empty because the previous
# placeholder [""] named a column that does not exist and made drop() raise a KeyError.
feature_to_drop = []
# errors="ignore" keeps the cell re-runnable after the columns are already gone.
df_bike = df_bike.drop(columns=feature_to_drop, errors="ignore")
df_bike.head()

In [None]:
n_bins=30
x = np.random.randn(1000, 3)
colors = ['blue', 'orange', 'green']
plt.hist(x, n_bins, density=1, histtype='bar', stacked=True, label=colors)
plt.legend(loc="upper right")
plt.title('Stacked-histogram ')
plt.show()

In [None]:
list_for_boxplot= ["start station id", "end station id"]
#sns.boxplot(x='variable', y='value', data=pd.melt(df_bike[list_for_boxplot]))
sns.boxplot(x='variable', y='value', data=pd.melt(df_bike[["start station id"]]))

### Geography 

In [None]:
gdf = geopandas.GeoDataFrame(df_bike, geometry=geopandas.points_from_xy(df_bike["start station longitude"], 
                                                                        df_bike["start station latitude"]))

In [None]:
# Load the low-resolution world basemap through geopandas' dataset helper.
# NOTE(review): `geopandas.datasets.get_path` is not available in recent GeoPandas
# releases — confirm the installed version or load the basemap from a local file.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# Restrict the basemap to North America.
ax = world[world.continent == 'North America'].plot(
    color='white', edgecolor='black')
# Overlay the start-station points from the GeoDataFrame in red.
gdf.plot(ax=ax, color='red')

plt.show()

In [None]:
plt.hist(df_bike["start station id"].unique())

In [None]:
plt.hist(df_bike["start station id"])
print("number of unique start-station-id is: ", df_bike["start station id"].unique().shape)

In [None]:
sns.histplot(df_bike["end station id"].unique())
print("number of unique end-station-id is: ", df_bike["end station id"].unique().shape)

In [None]:
sns.countplot(data=df_bike, x='start station id', order=df_bike["start station id"].value_counts().index)

In [None]:
sns.displot(data=df_bike, x="age", hue='usertype', multiple='stack', aspect=10/5); 

In [None]:
plt.boxplot(df_bike['duration_mins'])
print(df_bike['duration_mins'].describe())

### Correlation between unknown gender category and absurd DOB

In [None]:
df_bike['gender'] = df_bike.gender.astype('category')
plt.scatter(df_bike['birth year'], df_bike['gender'])

In [None]:
# Rows for casual customers only (the frame is named df_bike; `df` is not defined).
df_bike[df_bike['usertype']=='Customer']

In [None]:
# Histogram of trip duration for casual customers
# (the frame is named df_bike; `df` is not defined).
sns.histplot(df_bike[df_bike['usertype']=='Customer']['tripduration'])

In [None]:
n_bins=30
# Build one age sample per gender. Passing the whole gender column as `label` for
# a single dataset does not produce per-gender bars, and NaN ages break the
# automatic bin range, so missing ages are dropped and empty groups skipped.
ages_by_gender = {
    str(gender_name): ages.dropna()
    for gender_name, ages in df_bike.groupby('gender')['age']
}
ages_by_gender = {name: ages for name, ages in ages_by_gender.items() if not ages.empty}

# stacked=True so the plot matches its title.
plt.hist(list(ages_by_gender.values()), n_bins, density=True, histtype='bar', stacked=True,
         label=list(ages_by_gender.keys()))
plt.legend(loc="upper left")
plt.title('Stacked-histogram ')
plt.show()

In [None]:
df_bike['start station name'] = df_bike["start station name"].astype('category')
df_bike["start station name"].value_counts().sort_values()[:30].plot(kind='bar')


#top10 = month_jan[feature_cols].sum().sort_values(ascending=0).head(10)
#top10.plot(kind='bar')

In [None]:
df_bike.groupby('start station name')['usertype'].filter(lambda x: len(x)>5700)


In [None]:
df_bike['start station name'].value_counts() > 5000

## Modelling

### Splitting of train / val / test dataset

In [None]:
def get_class_counts(df, label, count_var):
    '''
    Count the rows of each class of a label column.

    param df        : pandas dataframe
    param label     : name of the label column to group by, as string
    param count_var : name of the column whose non-null entries are counted per class

    Returns a dict mapping each class of `label` to its count.

    Eg: df = pd.DataFrame({'Team':['Riders', 'Riders', 'Riders', 'Kings','Kings'],
                           'year':['2016','2016','2015','2018','2016']})
       get_class_counts(df, label='Team', count_var='year') will give you:
       {'Kings': 2, 'Riders': 3}
    '''
    per_class = df.groupby([label])[count_var].count()
    # Pair each class with its count, in the order of the grouped index.
    return dict(zip(per_class.index, per_class.values))


def get_class_proportions(df, label, count_var):
    '''
    Share of each class of a label column, relative to the number of rows in df.

    param df        : pandas dataframe
    param label     : name of the label column, as string
    param count_var : column passed on to get_class_counts for counting

    Returns a dict mapping each class to its proportion, rounded to 4 decimals.

    Eg: df = pd.DataFrame({'Team':['Riders', 'Riders', 'Riders', 'Kings','Kings'],
                           'year':['2016','2016','2015','2018','2016']})
        get_class_proportions(df, label='Team', count_var='year') will give you:
        {'Kings': 0.4, 'Riders': 0.6}
    '''
    total_rows = df.shape[0]
    proportions = {}
    for class_name, class_count in get_class_counts(df, label, count_var).items():
        proportions[class_name] = round(class_count/total_rows,4)
    return proportions

In [None]:
from sklearn.model_selection import train_test_split

def data_split(dataset, label, count_var, train_frac, random_state=123):
    '''
    param dataset     : Data to be split
    param label       : label column as string; both splits are stratified on it
    param count_var   : column used to count rows per class when printing the
                        class proportions of the three resulting sets
    param train_frac  : Ratio of train set to whole dataset, strictly between 0 and 1
    param random_state: seed passed to both train_test_split calls

    Randomly split dataset, based on these ratios:
        'train': train_frac
        'valid': (1-train_frac) / 2
        'test':  (1-train_frac) / 2

    Eg: passing train_frac=0.8 gives a 80% / 10% / 10% split

    Returns the tuple (train, val, test) and prints the class proportions of each.
    '''

    # Strict bounds: 0 and 1 leave one side of the split empty, and an integer 1
    # would be read by train_test_split as "exactly one training sample".
    assert 0 < train_frac < 1, "Invalid training set fraction"

    # float() guarantees the value is interpreted as a fraction, never as a row count.
    train, tmp = train_test_split(dataset, train_size=float(train_frac), random_state=random_state,
                                  stratify=dataset[label])
    # Split the remainder evenly into validation and test sets.
    val, test  = train_test_split(tmp, train_size=0.5,random_state=random_state,
                                  stratify=tmp[label])

    print(f"Class proportion: \n in train: {get_class_proportions(train, label, count_var)} \n in val: {get_class_proportions(val, label, count_var)} \n in test: {get_class_proportions(test, label, count_var)}")
    return train, val, test

In [None]:
train, val, test = data_split(df_bike, label='usertype', count_var='gender',train_frac=0.8)

# Accident dataset

## Upload Dataset

In [None]:
# Load the accident/insurance CSV: the first entry returned by os.listdir() for the dataset folder.
# NOTE(review): os.listdir() returns entries in arbitrary order, so index [0] is not
# guaranteed to select the same file on every machine — consider selecting the file by name.
df_ins = pd.read_csv(os.path.join(path_dataset, os.listdir(path_dataset)[0]))
print(f"shape of insurance dataset is {df_ins.shape}")
df_ins.head()

## Insight into data

In [None]:
draw_stats(df_ins, n_rows=11, n_cols=2)

In [None]:
df_ins.describe()

In [None]:
df_ins.isnull().sum()

## Feature Engineering

In [None]:
list_for_boxplot=["NUMBER OF MOTORIST INJURED", "NUMBER OF MOTORIST KILLED", "NUMBER OF PERSONS INJURED", 
                  "NUMBER OF PERSONS KILLED", "NUMBER OF PEDESTRIANS INJURED","NUMBER OF PEDESTRIANS KILLED"]
sns.boxplot(x='variable', y='value', data=pd.melt(df_ins[list_for_boxplot]))

In [None]:
df_ins['BOROUGH'] = df_ins.BOROUGH.astype('category')
df_ins["BOROUGH"].value_counts().plot(kind='bar')