In [None]:
import pandas as pd
import plotly.express as px
import calendar
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
# Read the trip log CSV into the notebook-wide frame and print its schema summary.
csv_path = 'My Uber Drives - 2016.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
data.head()

## Structuring Data

In [None]:
# The raw headers carry a trailing '*'; map each one to its bare name.
starred_columns = ['START_DATE', 'END_DATE', 'CATEGORY', 'START', 'STOP', 'MILES', 'PURPOSE']
data.rename(columns={f'{name}*': name for name in starred_columns}, inplace=True)

# Parse both timestamp columns; values that cannot be parsed become NaT.
for date_column in ('START_DATE', 'END_DATE'):
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

# Derived columns for EDA, sliced out of each timestamp's text form
# ('YYYY-MM-DD HH:MM:SS'): characters 5-6 are the month, 11-12 the hour.
# A NaT start renders as 'NaT', so both slices are '' for such rows.
start_as_text = data['START_DATE'].apply(str)
data['START_MONTH'] = start_as_text.str[5:7]
data['START_HOUR'] = start_as_text.str[11:13]

data.head()

## Data Cleaning

In [None]:
data.isna().sum()

Dropping Null Records

In [None]:
# Keep only rows that have at least three non-null values (thresh=3), then re-count nulls.
# NOTE(review): START_MONTH and START_HOUR are produced by slicing str(...) output, so
# they are never null and already account for two of the three required values; a row
# that is otherwise almost empty can therefore survive this filter. Confirm the
# threshold is still what is intended.
data.dropna(thresh=3, inplace=True)
data.isna().sum()

In [None]:
data.duplicated().sum()

Dropping Duplicate Records

In [None]:
# Remove rows that are exact copies of an earlier row (the first occurrence is kept),
# then confirm that no duplicates remain.
data.drop_duplicates(keep='first', inplace=True)
data.duplicated(keep='first').sum()

Filling Null Values

In [None]:
# Forward-fill missing trip purposes from the previous row.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# selected column is chained assignment and does not reach `data` under pandas
# Copy-on-Write. Series.ffill() also replaces the deprecated fillna(method='ffill').
data['PURPOSE'] = data['PURPOSE'].ffill()
data.isna().sum()

## Finding Insights

In [None]:
# Ride counts per start location, most frequent first.
start_counts = data['START'].value_counts()

# Keep ranks 0 and 2-10: ten entries, with the second most frequent value skipped.
# NOTE(review): presumably rank 1 is a placeholder such as an unknown location —
# confirm; filtering that label out by name would be more robust than skipping a rank.
# .iloc makes the selection explicitly positional; plain [] with integer keys on a
# string-labelled index relies on a deprecated positional fallback.
selected_ranks = [0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
top_start = start_counts.iloc[selected_ranks].reset_index()
top_start.columns = ['START_LOCATION', 'FREQUENCY']
top_start

In [None]:
# Bar chart of the selected start locations: frequency on x, location on y,
# one colour per location taken from the reversed 'amp' sequential scale.
start_location_fig = px.bar(
    top_start,
    x='FREQUENCY',
    y='START_LOCATION',
    title='10 Most Frequent Starting Location',
    color='START_LOCATION',
    color_discrete_sequence=px.colors.sequential.amp_r,
)
start_location_fig

In [None]:
# Rides per calendar month, in month order.
# Counting on START_DATE.dt.month ignores rows whose start could not be parsed (NaT),
# instead of relying on those rows forming a leading '' bucket that is sliced off with
# [1:] — a slice that would silently drop January whenever no such rows exist.
month_counts = data['START_DATE'].dt.month.value_counts().sort_index()
monthly_ride = month_counts.reset_index()
monthly_ride.columns = ['MONTH', 'RIDE']
# Month numbers come back as floats when NaT rows are present, hence int().
monthly_ride['MONTH'] = monthly_ride['MONTH'].apply(lambda month: calendar.month_name[int(month)])
monthly_ride

In [None]:
# Bar chart of ride counts per month, one colour per month taken from the
# reversed G10 qualitative palette.
monthly_ride_fig = px.bar(
    monthly_ride,
    x='MONTH',
    y='RIDE',
    title='Rides Taken per Month',
    color='MONTH',
    color_discrete_sequence=px.colors.qualitative.G10_r,
)
monthly_ride_fig

In [None]:
# Parse START_DATE as datetime (unparseable values become NaT) and print the schema.
# START_DATE is already converted in the "Structuring Data" cell, so this repeats that step.
data['START_DATE']=pd.to_datetime(data['START_DATE'], errors='coerce')
data.info()

In [None]:
# Time-of-day component of each trip's start timestamp.
data['TIME']=data['START_DATE'].dt.time

In [None]:
data.head()

In [None]:
data['PURPOSE'].value_counts()

In [None]:
# Number of rides per purpose, bars ordered from most to least frequent.
# The column is passed by keyword: recent seaborn releases read the first positional
# argument as `data`, so a bare Series there is no longer taken as the x variable.
purpose_order = data['PURPOSE'].value_counts().index
sn.countplot(data=data, x='PURPOSE', order=purpose_order, palette='rocket', saturation=1)
# Rotate the category labels to vertical; the trailing ';' hides the tick-list repr.
plt.xticks(rotation=90);

In [None]:
# Trip length in minutes: end minus start yields a timedelta, which is converted
# to seconds and divided by 60.
trip_length = data['END_DATE'] - data['START_DATE']
data['RIDE_DURATION'] = trip_length.dt.total_seconds() / 60

In [None]:
data.head()

In [None]:
def is_round_trip(trip):
    """Return 'yes' when a trip stops where it started, otherwise 'no'.

    Deliberately not called ``round``: a module-level function with that name
    replaces Python's built-in ``round`` for every later cell in the notebook.

    Parameters
    ----------
    trip : mapping or pandas.Series
        A single ride; must provide 'START' and 'STOP' entries.

    Returns
    -------
    str
        'yes' if ``trip['START'] == trip['STOP']``, else 'no'.
    """
    return 'yes' if trip['START'] == trip['STOP'] else 'no'


# Label every ride row-by-row (axis=1 hands each row to the helper as a Series).
data['ROUND_TRIP'] = data.apply(is_round_trip, axis=1)

In [None]:
data.head()

In [None]:
# Count of rides per ROUND_TRIP label ('yes' / 'no'), most frequent label first.
# The column is passed by keyword: recent seaborn releases read the first positional
# argument as `data`, so a bare Series there is no longer taken as the x variable.
round_trip_order = data['ROUND_TRIP'].value_counts().index
sn.countplot(data=data, x='ROUND_TRIP', order=round_trip_order, palette='rocket_r');

In [None]:
# Rides per purpose again, this time with the reversed 'rocket' palette.
# The column is passed by keyword: recent seaborn releases read the first positional
# argument as `data`, so a bare Series there is no longer taken as the x variable.
purpose_order = data['PURPOSE'].value_counts().index
sn.countplot(data=data, x='PURPOSE', order=purpose_order, palette='rocket_r', saturation=1)
# Rotate the category labels to vertical; the trailing ';' hides the tick-list repr.
plt.xticks(rotation=90);

In [None]:
# Replace the two-character month code with the full month name.
# START_MONTH holds strings, and the .dt accessor only exists for datetime-like
# values, so the name is derived from the START_DATE timestamps instead.
# Rows whose start could not be parsed (NaT) get a missing value here.
data['START_MONTH'] = data['START_DATE'].dt.month_name()
data.head()

In [None]:
data.head()

In [None]:
# Repair the mis-encoded city name in both location columns.
# replace() matches whole cell values here (regex is off by default), so the '?'
# is a literal character. The result is assigned back because an in-place replace
# on a selected column does not reach `data` under pandas Copy-on-Write.
location_columns = ['START', 'STOP']
data[location_columns] = data[location_columns].replace('Kar?chi', 'Karachi')

In [None]:
data.head()

In [None]:
# Ride counts per start location, sorted from most to least frequent.
c=data['START'].value_counts()

In [None]:
# Pie chart of the start locations that have more than 10 rides.
frequent_starts = c.loc[c.gt(10)]
frequent_starts.plot.pie(shadow=True)


In [None]:
# Ride counts per stop location, sorted from most to least frequent.
d=data['STOP'].value_counts()

In [None]:
# Pie chart of the stop locations that have more than 10 rides.
frequent_stops = d.loc[d.gt(10)]
frequent_stops.plot.pie(shadow=True)