In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load the taxi-fare dataset from a CSV expected in the working directory.
data=pd.read_csv("TaxiFare.csv")

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Column names, non-null counts and dtypes.
data.info()

In [None]:
# Dtype of each column.
data.dtypes

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
data.describe(include='all')

In [None]:
# Missing values per column.
data.isnull().sum()

In [None]:
# (rows, columns) before any filtering.
data.shape

In [None]:
# Keep only rows with a non-negative fare. The explicit .copy() makes the result an
# independent frame, so adding columns to it later is not a write to a slice of the
# original (the pattern pandas reports as SettingWithCopyWarning).
data=data[data["amount"] >= 0].copy()

In [None]:
# Number of rows remaining in `data`.
len(data)

In [None]:
# One bar per distinct value of `amount`.
# NOTE(review): if `amount` is continuous this draws a very large number of bars;
# a histogram may be more readable — confirm intent.
sns.countplot(x="amount",data=data)

In [None]:
sns.distplot(data["no_of_passenger"])

In [None]:
sns.distplot(data[data["amount"] < 100]["amount"], kde=False);

In [None]:
print(min(data["longitude_of_pickup"].min(), data["longitude_of_dropoff"].min()))

In [None]:
print(max(data["longitude_of_pickup"].max(), data["longitude_of_dropoff"].max()))

In [None]:
print(min(data["latitude_of_pickup"].min(), data["latitude_of_dropoff"].min()))

In [None]:
print(max(data["latitude_of_pickup"].max(), data["latitude_of_dropoff"].max()))

In [None]:
# Axis limits used to zoom the map-like scatter plots
# (presumably the city's bounding box — confirm the values).
city_long_border = (-74.03,-73.75)
city_lat_border = (40.63,40.85)

# Pickup locations drawn as small translucent red dots, cropped to the limits above.
ax = data.plot.scatter(x='longitude_of_pickup', y='latitude_of_pickup',
                       color='red', s=0.2, alpha=.6)
ax.set_title("Pickups")
ax.set_ylim(city_lat_border)
ax.set_xlim(city_long_border)

In [None]:
# Axis limits used to zoom the scatter plot
# (presumably the city's bounding box — confirm the values).
city_long_border = (-74.03,-73.75)
city_lat_border = (40.63,40.85)

# Dropoff locations drawn as small translucent blue dots. The title now says
# "Dropoffs": it previously repeated "Pickups", mislabelling this figure.
data.plot(kind='scatter',x='longitude_of_dropoff',y='latitude_of_dropoff',color='blue',s=0.2,alpha=.6)
plt.title("Dropoffs")
plt.ylim(city_lat_border)
plt.xlim(city_long_border)

In [None]:
def distance(lat1, lon1, lat2, lon2):
  """Great-circle distance in miles between two points given in decimal degrees.

  Uses the haversine formula. All arithmetic goes through NumPy ufuncs, so the
  arguments may be scalars, NumPy arrays or pandas Series (evaluated element-wise).

  Args:
    lat1, lon1: latitude / longitude of the first point, in degrees.
    lat2, lon2: latitude / longitude of the second point, in degrees.

  Returns:
    The distance in miles, with the same shape as the broadcast inputs.
  """
  deg_to_rad = 0.017453292519943295 # Pi/180
  km_to_miles = 0.6213712
  earth_diameter_km = 12742

  # Haversine of the central angle, split into its latitude and longitude parts.
  lat_term = 0.5 - np.cos((lat2 - lat1) * deg_to_rad)/2
  lon_term = np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad) * (1 - np.cos((lon2 - lon1) * deg_to_rad))/2
  haversine = lat_term + lon_term

  return (km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(haversine)))

In [None]:
# Engineered feature: pickup-to-dropoff distance computed row-wise by distance().
data["distance_miles"] = distance(data["latitude_of_pickup"], data["longitude_of_pickup"], 
                                      data["latitude_of_dropoff"], data["longitude_of_dropoff"])

In [None]:
# Preview the frame with the new distance_miles column.
data.head()

In [None]:
# Distribution of the computed ride distances (50 bins, wide figure).
data["distance_miles"].hist(bins=50, figsize=(12,4))
plt.title("Histogram ride distance in miles");

In [None]:
# NOTE(review): these two lookup tables are not referenced anywhere else in the
# notebook as visible here; they look like the start of a weekday/year encoding
# that was never applied. Kept so nothing depending on them breaks — confirm and remove.
variables={'Monday' : 1 , 'Tuesday' : 2 , 'Wednesday' : 5 , 'Thursday' : 3 , 'Friday' : 7 
           ,'Saturday' : 4 , 'Sunday' : 6 }

year = {2012 : 1 , 2013 : 2 , 2011 : 3 , 2009 : 4 , 2010 : 5 , 2014 : 6 , 2015 :7}

# Remove the timestamp and identifier columns before modelling.
# A single non-inplace drop with errors="ignore" keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
data = data.drop(columns=['date_time_of_pickup', 'unique_id'], errors="ignore")


data.head()

In [None]:
x = data.drop(['amount'],axis=1)
y = data['amount']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30,random_state=0)

In [None]:
print("Shape of x_train: ",x_train.shape)
print("Shape of x_test: ",x_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test: ",y_test.shape)

In [None]:
# Baseline model: ordinary least-squares regression on the taxi features.
model_1r = LinearRegression()
model_1r.fit(x_train, y_train)

# R^2 on the training split (kept in `k`; `py` keeps a handle on this fitted model).
k=model_1r.score(x_train, y_train)
py=model_1r
print(k)

# R^2 on the held-out split: the training score alone cannot reveal over- or
# under-fitting, and the test split was otherwise never used for this model.
print("Test R^2:", model_1r.score(x_test, y_test))

In [None]:
from sklearn.tree import DecisionTreeRegressor

np.random.seed(10)
X = np.sort(17 * np.random.rand(17, 1), axis=0)
y = np.sin(X).ravel() + np.random.normal(0, 10000, X.shape[0])

regressor = DecisionTreeRegressor(max_depth=10)

regressor.fit(X, y)

X_test = np.arange(1, 17, 0.01)[:, np.newaxis]
y_pred = regressor.predict(X_test)

In [None]:
model_1r =  DecisionTreeRegressor()
model_1r.fit(X, y)

model_1r.score(X, y)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gradient_boosting_regressor.fit(X_train, y_train)
y_pred = gradient_boosting_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

X_test_sorted = np.sort(X_test, axis=0)
y_pred_sorted = gradient_boosting_regressor.predict(X_test_sorted)

In [None]:
model_1r = GradientBoostingRegressor()
model_1r.fit(X_train, y_train)

model_1r.score(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)


random_forest_regressor = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)

random_forest_regressor.fit(X_train, y_train)
# rr=random_forest_regressor
y_pred = random_forest_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

X_test_sorted = np.sort(X_test, axis=0)
y_pred_sorted = random_forest_regressor.predict(X_test_sorted)

In [None]:
model_1r = RandomForestRegressor()
model_1r.fit(X_train, y_train)

model_1r.score(X_train, y_train)

In [None]:
ndf = pd.read_csv("TaxiFare.csv")

In [None]:
x = ndf.drop(['amount','date_time_of_pickup','unique_id'],axis=1)
y = ndf['amount']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.30,random_state=0)

In [None]:
x_train.info()
# y_train.info()

In [None]:
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gradient_boosting_regressor.fit(x_train, y_train)
pyr = gradient_boosting_regressor


In [None]:
import joblib

# Serialize the fitted gradient-boosting model to disk.
joblib.dump(pyr, 'model.pkl')

# Read it straight back as a round-trip check. Both names refer to the same
# loaded object; despite its name, `ridge_from_joblib` holds the gradient-boosting
# model, not a ridge regressor.
model = joblib.load("model.pkl")
ridge_from_joblib = model

In [None]:
def preprocess(lat1,lon1,lat2,lon2,nop):
    """Predict the fare for a single ride using the model saved in model.pkl.

    Args:
        lat1, lon1: pickup latitude / longitude.
        lat2, lon2: dropoff latitude / longitude.
        nop: number of passengers.

    Returns:
        The array returned by the model's predict() for this single ride.

    Raises:
        KeyError: if the saved model expects a feature that is not one of the
            five inputs above.
    """
    trained_model=joblib.load("model.pkl")

    # Name every value so it can be matched to the column the model was fitted
    # on, instead of relying on the argument order happening to equal the CSV
    # column order (which this function cannot see).
    features = pd.DataFrame([{
        "latitude_of_pickup": lat1,
        "longitude_of_pickup": lon1,
        "latitude_of_dropoff": lat2,
        "longitude_of_dropoff": lon2,
        "no_of_passenger": nop,
    }])

    # Estimators fitted on a DataFrame record the training column order in
    # feature_names_in_; align to it when available.
    expected_columns = getattr(trained_model, "feature_names_in_", None)
    if expected_columns is None:
        # Model was fitted on a plain array: keep the original positional order.
        test_data = features.to_numpy()
    else:
        test_data = features[list(expected_columns)]

    prediction=trained_model.predict(test_data)
    return prediction
# Example: predict the fare for one ride (pickup lat/lon, dropoff lat/lon, 1 passenger).
t=preprocess(40.733143,-73.987130,40.758092,-73.991567,1)
print("Fare: ",t)