In [1]:

#Import relevant libraries and define settings for plotting.
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import time
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.pyplot import *

#Apply seaborn's default theme, then use a 10-colour HLS palette starting at hue 0.5.
sns.set()
pal = sns.hls_palette(n_colors=10, h=0.5)
sns.set_palette(pal)

#Render floats with 4 fixed decimals instead of scientific notation:
pd.set_option('display.float_format', '{:.4f}'.format)

In [2]:
#Load the drivers dataset, parsing first_completed_trip as a datetime column.
#NOTE(review): absolute, machine-specific path - adjust DATA_PATH when running elsewhere.
DATA_PATH = 'C:/Users/shubhangipatil/Desktop/UBER/drivers_data.csv'
df = pd.read_csv(
    DATA_PATH,
    parse_dates=['first_completed_trip'],
    infer_datetime_format=True,
)

In [3]:
#Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20202 entries, 0 to 20201
Data columns (total 5 columns):
driver_id                   20202 non-null object
first_completed_trip        20202 non-null datetime64[ns]
lifetime_rating             19662 non-null float64
lifetime_fares              20202 non-null float64
lifetime_completed_trips    20202 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 789.2+ KB


In [4]:
#Count fully duplicated rows; the result is 0 (this takes a while to run):
dup = int(df.duplicated().sum())
dup

0

In [5]:
#Flag, per column, whether it holds at least one missing value.
df.isna().any()

driver_id                   False
first_completed_trip        False
lifetime_rating              True
lifetime_fares              False
lifetime_completed_trips    False
dtype: bool

In [6]:
#Preview a few rows whose driver rating is missing.
#One idea to fill missing values would be to predict the driver rating from the other features.
df.loc[df['lifetime_rating'].isna()].head()

Unnamed: 0,driver_id,first_completed_trip,lifetime_rating,lifetime_fares,lifetime_completed_trips
40,42f6-eeba,2012-01-06,,8518.36,1338
66,4025-dc78,2011-08-29,,1174.37,180
192,4e93-9e57,2012-04-05,,240.47,41
215,4e53-61b2,2012-03-23,,1001.88,154
261,4715-f438,2011-08-08,,369.5,51


In [7]:
#Split first_completed_trip into separate Year, Month, Day and Weekday columns.
#Weekday comes from pandas' dayofweek accessor.

trip_date = df['first_completed_trip'].dt
df['Year'], df['Month'] = trip_date.year, trip_date.month
df['Day'], df['Weekday'] = trip_date.day, trip_date.dayofweek

In [8]:
#Preview the first rows, now including the Year, Month, Day and Weekday columns.
df.head()

Unnamed: 0,driver_id,first_completed_trip,lifetime_rating,lifetime_fares,lifetime_completed_trips,Year,Month,Day,Weekday
0,48dc-c2db,2010-11-18,4.82,8038.1,877,2010,11,18,3
1,49aa-b176,2011-12-29,4.79,9428.02,967,2011,12,29,3
2,4332-47bc,2010-08-13,4.92,13578.02,1719,2010,8,13,4
3,4004-efca,2012-02-12,4.41,2020.95,172,2012,2,12,6
4,4efa-33c0,2011-04-28,4.61,15984.75,1755,2011,4,28,3


In [9]:
#Fill missing ratings with the mean of the observed ratings.

mean_rating = df['lifetime_rating'].mean()
df['lifetime_rating'] = df['lifetime_rating'].fillna(mean_rating)

df.head()

Unnamed: 0,driver_id,first_completed_trip,lifetime_rating,lifetime_fares,lifetime_completed_trips,Year,Month,Day,Weekday
0,48dc-c2db,2010-11-18,4.82,8038.1,877,2010,11,18,3
1,49aa-b176,2011-12-29,4.79,9428.02,967,2011,12,29,3
2,4332-47bc,2010-08-13,4.92,13578.02,1719,2010,8,13,4
3,4004-efca,2012-02-12,4.41,2020.95,172,2012,2,12,6
4,4efa-33c0,2011-04-28,4.61,15984.75,1755,2011,4,28,3


In [10]:
#Re-check every column for missing values after filling the ratings.
df.isna().any()

driver_id                   False
first_completed_trip        False
lifetime_rating             False
lifetime_fares              False
lifetime_completed_trips    False
Year                        False
Month                       False
Day                         False
Weekday                     False
dtype: bool

In [11]:
# Filter Dataset, Define Functions, and Load Methods to be Used to Create Visualizations.

#Summary statistics (count, mean, std, min, quartiles, max) of the numerical features:
df_viz = df.loc[:, ['lifetime_rating', 'lifetime_fares', 'lifetime_completed_trips']].describe()
df_viz

Unnamed: 0,lifetime_rating,lifetime_fares,lifetime_completed_trips
count,20202.0,20202.0,20202.0
mean,4.7707,71465.2511,4890.4818
std,0.1169,5666502.1379,295071.8637
min,2.0,8.78,1.0
25%,4.72,6435.315,669.0
50%,4.79,17662.59,1802.0
75%,4.84,39661.01,3883.0
max,5.0,805410479.25,41940330.0


In [12]:
#Required by Plotly:
import matplotlib.mlab as mlab
import plotly.plotly as py

In [13]:
from matplotlib.ticker import FuncFormatter #Call formatter function to format tick values
from matplotlib.offsetbox import (OffsetImage, AnnotationBbox) #Create image box
from matplotlib._png import read_png #Load png file
from matplotlib.patches import Ellipse #Draw ellipse

In [14]:
#Create functions to format tick numbers
def thousands_comma(x, pos):
    """
    Tick formatter: show the value with comma thousands separators.

    Args:
        x: tick value.
        pos: tick position (not used by the formatting).

    Returns:
        The value as a string with thousands commas and no decimals,
        e.g. 1234567.8 -> '1,234,568'.
    """
    return format(x, ',.0f')

In [15]:
def thousands_format(x, pos):
    """
    Tick formatter: show the value as a count of thousands followed by 'K'.

    Args:
        x: tick value.
        pos: tick position (not used by the formatting).

    Returns:
        The value scaled by 1e-3, with no decimals and a 'K' suffix,
        e.g. 5000 -> '5K'.
    """
    in_thousands = x * 1e-3
    return '%.0fK' % in_thousands

In [16]:
def thousands_currency(x, pos):
    """
    Tick formatter: show the value as a dollar amount in thousands.

    Args:
        x: tick value.
        pos: tick position (not used by the formatting).

    Returns:
        The value scaled by 1e-3, prefixed with '$', with no decimals and a
        'K' suffix in lieu of three zeros, e.g. 5000 -> '$5K'.
    """
    # One numeric field only: the '$' and 'K' are literal text, so neither can be
    # truncated or dropped by the format specification.
    return '${:.0f}K'.format(x * 1e-3)

In [17]:
#Define robust function to automatically add annotated labels on bar plots.
#Inspiration from http://composition.al/blog/2015/11/29/a-better-way-to-add-labels-to-bar-charts-with-matplotlib/

def annotate_labels(ax, labels_list, **kwargs):
    """
    Add a text label just above each bar of a bar chart.

    Bars (``ax.patches``) and labels are paired in order; pairing stops at the
    shorter of the two sequences.

    Args:
        ax: plot axis holding the bars; it must provide ``get_ylim()``,
            ``patches`` and ``text()``.
        labels_list: ordered labels, one per bar.
        **kwargs: text properties, forwarded to ``ax.text`` as keyword arguments.

    Returns:
        None.
    """
    # The gap between a bar's top and its label is 1% of the visible axis height.
    (y_bottom, y_top) = ax.get_ylim()
    label_offset = (y_top - y_bottom) * 0.01

    for rect, label in zip(ax.patches, labels_list):
        # Anchor on the bar's top edge (base + height) so that bars which do not
        # start at zero, e.g. stacked bars, are labelled at their top as well.
        bar_top = rect.get_y() + rect.get_height()
        label_x = rect.get_x() + rect.get_width() / 2.
        ax.text(label_x, bar_top + label_offset, label, **kwargs)
    return None

In [18]:
#Group the driver-level data by lifetime fares and select the completed-trips column.
#The grouping runs on df: df_viz only holds the describe() summary rows, not drivers.
byTrips = df.groupby('lifetime_fares')['lifetime_completed_trips']

In [19]:
#Keep drivers rated at least 1, then take the median completed trips per rating value.
df1 = df.loc[df['lifetime_rating'] >= 1]
df2 = df1.groupby('lifetime_rating')['lifetime_completed_trips'].median()

In [20]:
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML


#plotly.offline.init_notebook_mode(connected=True)


init_notebook_mode(connected=True)

# Years offered on the animation slider. They are kept as strings because they
# double as frame names and slider labels.
years = ['2006', '2007','2008','2009','2010','2011','2012']

# Distinct ratings in order of first appearance; each one becomes a trace.
ratings = df['lifetime_rating'].unique().tolist()


def _rating_traces(year_df):
    """
    Build one scatter trace per rating from the rows of `year_df`.

    A trace is produced for every entry of `ratings`, even when `year_df` has no
    rows for that rating, so that trace i stands for the same rating in the
    initial data and in every animation frame.
    """
    traces = []
    for rating in ratings:
        rows = year_df[year_df['lifetime_rating'] == rating]
        traces.append({
            'x': list(rows['Day']),
            'y': list(rows['lifetime_completed_trips']),
            'mode': 'markers',
            'text': list(rows['lifetime_rating']),
            'marker': {
                'sizemode': 'area',
                'sizeref': 200000
            },
            'name': rating
        })
    return traces


# make figure
figure = {
    'data': [],
    'layout': {},
    'frames': []
}

# fill in most of layout
# 'Day' is the day of the month (1-31), so the range has to reach past 31.
figure['layout']['xaxis'] = {'range': [0, 32], 'title': 'Day'}
figure['layout']['yaxis'] = {'title': 'Lifetime completed trips', 'type': 'log'}
figure['layout']['hovermode'] = 'closest'
figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 500, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'quadratic-in-out'}}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

# Slider definition; one step per year is appended below.
sliders_dict = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}

# make data: the initial view shows the first slider year only, matching the
# slider's starting position ('active': 0).
figure['data'] = _rating_traces(df[df['Year'] == int(years[0])])

# make frames: one frame and one slider step per year. The year filter is applied
# once per year rather than once per rating.
for year in years:
    figure['frames'].append({
        'data': _rating_traces(df[df['Year'] == int(year)]),
        'name': str(year)
    })
    sliders_dict['steps'].append({
        'args': [
            [year],
            {'frame': {'duration': 300, 'redraw': False},
             'mode': 'immediate',
             'transition': {'duration': 300}}
        ],
        'label': year,
        'method': 'animate'
    })

figure['layout']['sliders'] = [sliders_dict]

plotly.offline.iplot(figure)