In [None]:
#Загружаем все библиотеки и данные 

from tensorflow.keras import losses, models, optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, Dropout, Activation, Flatten) 
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.datasets import load_boston 
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from geopy import distance

# Load the first 7000 rows of the training set; pickup_datetime is parsed as a datetime column.
data = pd.read_csv("../input/new-york-city-taxi-fare-prediction/train.csv", sep=',', nrows=7000, parse_dates=["pickup_datetime"])

# --- Clean and prepare the data ---

# Drop every row that contains at least one missing value.
data = data.dropna(how='any', axis='rows')
print('New size: %d' % len(data))

pd.set_option('display.expand_frame_repr', False)
print(data.head())
# Correlate the numeric columns only, so that non-numeric columns cannot break
# the call on pandas >= 2.0, where they are no longer skipped automatically.
print(data.select_dtypes(include='number').corr())

COORD_COLUMNS = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
# NOTE(review): the same +/-90 bound is applied to longitudes as well, although
# longitude is geographically valid up to +/-180. Kept as-is; confirm that
# treating |longitude| > 90 as an outlier is intended for this data set.
COORD_LIMIT = 90

# Report how many values of each coordinate column fall above / below the bound.
for column in COORD_COLUMNS:
    print((data[column] > COORD_LIMIT).sum())
    print((data[column] < -COORD_LIMIT).sum())

# Discard rows where any coordinate is outside [-COORD_LIMIT, COORD_LIMIT].
coords = data[COORD_COLUMNS]
out_of_range = ((coords > COORD_LIMIT) | (coords < -COORD_LIMIT)).any(axis=1)
data_n = data[~out_of_range]

# Discard rows where any coordinate is exactly 0.
data_n = data_n[(data_n[COORD_COLUMNS] != 0).all(axis=1)]

# Discard trips whose dropoff point equals the pickup point. A trip counts as
# "moved" when latitude OR longitude differs; requiring both to differ (`&`)
# would also throw away trips that share only one of the two coordinates.
moved = (
    (data_n.dropoff_latitude != data_n.pickup_latitude)
    | (data_n.dropoff_longitude != data_n.pickup_longitude)
)
data_n = data_n[moved]
data_n = data_n.reset_index(drop=True)

print('New size: %d' % len(data_n))

In [None]:
# Distance between the pickup point and the dropoff point of every trip.
# The result is read through `.miles` so that the values agree with the
# `distance_miles` column name (reading `.km` would store kilometres in it).
# Iterating over the columns with zip() is positional, so it does not depend
# on data_n having a default 0..n-1 index.
distance_miles = [
    distance.distance((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).miles
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        data_n.pickup_latitude,
        data_n.pickup_longitude,
        data_n.dropoff_latitude,
        data_n.dropoff_longitude,
    )
]

data_n['distance_miles'] = distance_miles
print(data_n.head())

# Sanity check: number of trips for which no distance was produced.
is_null1 = pd.isnull(data_n.distance_miles).sum()
print(is_null1)

In [None]:
def _scatter_vs_fare(x, title, xlabel, ylabel, **scatter_kwargs):
    """Scatter ``x`` against ``data_n.fare_amount`` on a new 11x8 figure.

    Extra keyword arguments are forwarded to ``Axes.scatter``.
    Returns the ``(figure, axes)`` pair that was created.
    """
    figure, axes = plt.subplots(figsize=(11, 8))
    axes.scatter(x, data_n.fare_amount, **scatter_kwargs)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    return figure, axes


# Fare against trip distance.
fig, ax1 = _scatter_vs_fare(
    data_n.distance_miles,
    'График расходов, связанных с расстоянием',
    "Дистанция",
    "Оплата",
    color='g',
    alpha=0.2,
)

# Fare against number of passengers.
fig3, ax5 = _scatter_vs_fare(
    data_n.passenger_count,
    'График оплаты в зависимости от количества пассажиров',
    'Кол-во пассажиров',
    'Оплата',
    color='green',
)

# Calendar features derived from the pickup timestamp; the vectorised .dt
# accessor replaces a per-row Python lambda.
# NOTE(review): the values are taken in whatever time zone pickup_datetime was
# parsed with — confirm that this is the intended clock for `hour`/`weekday`.
data_n['year'] = data_n.pickup_datetime.dt.year
data_n['weekday'] = data_n.pickup_datetime.dt.weekday
data_n['hour'] = data_n.pickup_datetime.dt.hour

# Fare against day of the week.
fig2, ax5 = _scatter_vs_fare(
    data_n.weekday,
    'График оплаты в зависимости от дня недели',
    'День недели',
    'Плата',
    color='green',
)

# Fare against hour of the pickup.
fig2, ax5 = _scatter_vs_fare(
    data_n.hour,
    'График оплаты в зависимости от времени вызова',
    'Время вызова',
    'Плата',
    color='green',
)

In [None]:
# Simple (single-feature) linear regression fitted by ordinary least squares:
# fare_amount as a function of distance_miles.

lr = LinearRegression()
# scikit-learn expects a 2-D feature matrix, so the single feature is kept as
# an (n_samples, 1) array.
x = data_n[['distance_miles']].to_numpy()
y = data_n.fare_amount

# Fit on all rows and compute in-sample metrics.
lr.fit(x, y)
y_pred = lr.predict(x)
mse_lin_rm = mean_squared_error(y, y_pred)
rmse_lin_rm = np.sqrt(mse_lin_rm)
r2_lin_rm = r2_score(y, y_pred)

# 5-fold cross-validation without shuffling.
kf = KFold(n_splits=5, random_state=None, shuffle=False)
mse_lin_rm_kf = []
r2_lin_rm_kf = []
for train_index, test_index in kf.split(x):
    # A separate estimator per fold keeps `lr` fitted on the full data set.
    fold_model = LinearRegression()
    # KFold yields positional indices, hence .iloc on the pandas Series.
    fold_model.fit(x[train_index], y.iloc[train_index])
    fold_pred = fold_model.predict(x[test_index])
    mse_lin_rm_kf.append(mean_squared_error(y.iloc[test_index], fold_pred))
    r2_lin_rm_kf.append(r2_score(y.iloc[test_index], fold_pred))
# The reported quantity is the mean squared error across folds (+/- 2 std).
print("MSE: %0.2f (+/- %0.2f)" % (np.mean(mse_lin_rm_kf), np.std(mse_lin_rm_kf) * 2))
print("Mean R^2: %0.2f" % (np.mean(r2_lin_rm_kf)))

fig = plt.figure(figsize=(11, 8))
ax5 = fig.add_subplot(1, 1, 1)

# Observations plus the fitted regression line. No `norm` is passed: it only
# applies to colour-mapped data and must be a Normalize instance, not a float.
ax5.scatter(data_n.distance_miles, data_n.fare_amount, color='g', alpha=0.2)
ax5.plot(x, y_pred)

ax5.set_title('Парная линейная регрессия методом наименьших квадратов ')
ax5.set_xlabel('Дистанция ')
ax5.set_ylabel('Плата')

In [None]:
# Multiple linear regression: fare_amount from passenger_count and distance_miles.
x_mul = data_n[['passenger_count','distance_miles']]
y = data_n.fare_amount

# Hold out 40% of the rows so that the error is measured on unseen data.
X_train, X_test, y_train, y_test = train_test_split(x_mul, y, test_size=0.4, random_state=0)

# Fit on the training part only and evaluate on the hold-out part; fitting and
# scoring on the same full data set would leave the split unused and give an
# optimistic error estimate.
lr_mul = LinearRegression()
lr_mul.fit(X_train, y_train)
p = lr_mul.predict(X_test)
mse_lin_mul = mean_squared_error(y_test, p)

# Predicted vs. actual fare on the hold-out rows.
fig = plt.figure(figsize=(11, 8))
mulreg = fig.add_subplot(1, 1, 1)
mulreg.scatter(p, y_test, color='g', alpha=0.1)
mulreg.set_title('Множественная линейная регрессия')
mulreg.set_xlabel('Предсказанная плата')
mulreg.set_ylabel('Фактическая плата')

In [None]:
# Ridge regression: fare_amount from passenger_count and distance_miles.
x_mul = data_n[['passenger_count','distance_miles']]
y = data_n.fare_amount

# Hold out 40% of the rows so that the error is measured on unseen data.
X_train, X_test, y_train, y_test = train_test_split(x_mul, y, test_size=0.4, random_state=0)

# Fit on the training part only and evaluate on the hold-out part; fitting and
# scoring on the same full data set would leave the split unused and give an
# optimistic error estimate.
lr_mul = Ridge(alpha=0.1)  # alpha is the regularisation strength
lr_mul.fit(X_train, y_train)
p = lr_mul.predict(X_test)
mse_lin_mul = mean_squared_error(y_test, p)

# Predicted vs. actual fare on the hold-out rows.
fig = plt.figure(figsize=(11, 8))
mulreg = fig.add_subplot(1, 1, 1)
mulreg.scatter(p, y_test, color='g', alpha=0.1)
mulreg.set_title('Множественная линейная регрессия (Ridge)')
mulreg.set_xlabel('Предсказанная плата')
mulreg.set_ylabel('Фактическая плата')