In [None]:
import pandas as pd
import numpy as np
# Read the cab rides CSV into a DataFrame and display it.
cab_data = pd.read_csv(
    '../input/uber-cab-fare-price-analysis/cab_rides.csv'
)
cab_data

In [None]:
# Read the weather CSV into a DataFrame and display it.
weather_data = pd.read_csv(
    '../input/uber-cab-fare-price-analysis/weather.csv'
)
weather_data

Reducing the memory footprint of the datasets

In [None]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Signed NumPy integer columns are cast to the narrowest of
      int8 / int16 / int32 whose range contains the column's min and max
      (bounds inclusive).
    * NumPy float columns are cast to float16 or float32 using the same
      range test.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, unsigned or nullable integers,
      existing categoricals, ...) is left untouched.

    Note: the float test only looks at the value range, so the cast to
    float16 / float32 can round values. Cast back to a wider float where
    full precision matters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns are range-tested.
        # Comparing bool, datetime or extension dtypes against iinfo/finfo
        # limits would either raise or turn them into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32)]
        else:
            candidates = [(t, np.finfo(t)) for t in (np.float16, np.float32)]

        for target, limits in candidates:
            # A NaN min/max (empty or all-missing column) fails every
            # comparison, so such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Avoid dividing by zero when the frame reports no memory at all.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

In [None]:
# Shrink both frames (cab rides first, then weather) and rebind the names
# to the returned frames.
cab_data, weather_data = map(reduce_memory_usage, (cab_data, weather_data))

Analysis of the datasets

In [None]:
# Show the column labels of the cab rides frame.
cab_data.columns

In [None]:
# Show the column labels of the weather frame.
weather_data.columns

In [None]:
# Print the per-column dtype / non-null summary of the cab rides frame.
cab_data.info()

In [None]:
# Print the per-column dtype / non-null summary of the weather frame.
weather_data.info()

In [None]:
# Show the dtype of every cab rides column (after the memory reduction above).
cab_data.dtypes

Converting the time_stamp column to datetime

In [None]:
import datetime
# pd.to_datetime interprets bare integers as nanoseconds since the epoch
# unless `unit` is given, which would place every row on 1970-01-01 and make
# the derived day/hour features constant.
# NOTE(review): assumes cab_rides.csv stores epoch *milliseconds* and
# weather.csv stores epoch *seconds* -- confirm against the raw values.
cab_data['date_time'] = pd.to_datetime(cab_data['time_stamp'], unit='ms')
weather_data['date_time'] = pd.to_datetime(weather_data['time_stamp'], unit='s')
cab_data

In [None]:
# (rows, columns) of the cab rides frame.
cab_data.shape

In [None]:
# (rows, columns) of the weather frame.
weather_data.shape

Converting some columns to the desired datatype

In [None]:
# Cast the numeric ride columns to the default float dtype, then preview
# the first rows.
for column in ('distance', 'price', 'surge_multiplier'):
    cab_data[column] = cab_data[column].astype('float')
cab_data.head()

In [None]:
# Cast temperature and pressure to the default float dtype, then preview
# the first rows.
for column in ('temp', 'pressure'):
    weather_data[column] = weather_data[column].astype('float')
weather_data.head()

In [None]:
# Summary statistics for the cab rides frame.
cab_data.describe()

In [None]:
# Summary statistics for the weather frame.
weather_data.describe()

Concatenating the two datasets

In [None]:
# Stack the two frames row-wise (axis=0 is the default). Columns that exist
# in only one frame hold missing values for the other frame's rows.
# NOTE(review): this appends weather observations as extra rows instead of
# joining them onto rides -- confirm that is the intended combination.
a = pd.concat([cab_data, weather_data], axis=0)

In [None]:
# Preview the first rows of the combined frame.
a.head()

Adding two new columns (day and hour) to the dataset

In [None]:
# Derive day-of-month and hour-of-day from the date_time column.
timestamps = a['date_time'].dt
a['day'] = timestamps.day
a['hour'] = timestamps.hour

In [None]:
# Preview the last rows of the combined frame.
a.tail()

Filling the missing values with 0

In [None]:
# Replace every missing value in the combined frame with 0, modifying `a`
# in place.
a.fillna(value=0, inplace=True)

In [None]:
# Preview the first rows again after filling missing values.
a.head()

In [None]:
# Column labels of the combined frame.
a.columns

Grouping the dataset by cab type

In [None]:
# Count the non-null entries of every column within each cab_type group.
a.groupby(by='cab_type').count()

In [None]:
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt


In [None]:
# Bar chart of the per-column counts for each cab_type group.
counts_by_cab_type = a.groupby('cab_type').count()
counts_by_cab_type.plot.bar()

In [None]:
# Bar chart of how often each distinct price value occurs.
price_counts = a['price'].value_counts()
price_counts.plot.bar(figsize=(100, 50), color='red')

In [None]:
# Bar chart of how many rows fall into each hour value.
hour_counts = a['hour'].value_counts()
hour_counts.plot.bar(figsize=(10, 5), color='red')

In [None]:
# Joint distribution of hour (x) against price (y).
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and older releases accept the keyword form as well.
sns.jointplot(x=a.hour, y=a.price)

In [None]:
# Line plot with hour on the horizontal axis and price on the vertical axis
# (note that `y` holds the hour and `x` the price).
x, y = a.price, a.hour
plt.plot(y, x)

In [None]:
# Joint distribution of rain (x) against price (y).
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and older releases accept the keyword form as well.
sns.jointplot(x=a.rain, y=a.price)

In [None]:
# Column labels of the combined frame, now including 'day' and 'hour'.
a.columns

In [None]:
# Feature matrix (x1) and regression target (y1) taken from the combined frame.
feature_columns = [
    'distance', 'surge_multiplier', 'temp', 'clouds', 'pressure',
    'rain', 'humidity', 'wind', 'day', 'hour',
]
x1 = a[feature_columns]
y1 = a['price']

Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows with a fixed seed.
# NOTE(review): train_test_split returns (X_train, X_test, y_train, y_test),
# so with this unpacking order `y_train` actually holds the TEST features and
# `x_test` the TRAINING targets. The names are kept because later cells
# depend on them.
x_train, y_train, x_test, y_test = train_test_split(
    x1,
    y1,
    test_size=0.25,
    random_state=42,
)

Importing the three regression models used for the prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Alternative estimators (swap in by uncommenting one of these):
# model = RandomForestRegressor()
# model = LinearRegression()
model = DecisionTreeRegressor()
# `x_test` holds the training targets here (see the unpacking order used
# for train_test_split), so this fits on the training rows.
model.fit(X=x_train, y=x_test)

In [None]:
# `y_train` is the second value returned by train_test_split, i.e. the
# held-out feature rows, so these are predictions for the test set.
pred = model.predict(X=y_train)
print(pred)

In [None]:
# Put the held-out targets and the model's predictions side by side.
comparison = {"actual": y_test, "prediction": pred}
df = pd.DataFrame(data=comparison)
df

Prediction quality (R² score)

In [None]:
# model.score returns R^2. Evaluate it on the held-out rows rather than on
# the rows the model was fitted on: the split cell unpacks train_test_split's
# (X_train, X_test, y_train, y_test) into x_train, y_train, x_test, y_test,
# so `y_train` holds the test features and `y_test` the test targets.
rsquare = model.score(y_train, y_test)
print(rsquare)