**Task: predict the location of the accident.**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics

# Reading and modifying data.

First, **we need to read the data, clean it and re-encode it as needed**.

In [None]:
# Read the accidents CSV from the notebook's input directory and preview it.
csvPath = "../input/us-accidents/US_Accidents_June20.csv"
data = pd.read_csv(csvPath)
data.head()

In [None]:
# Show the column labels of the loaded frame.
print(data.columns)

In [None]:
# Remove the columns listed below, then discard every row that still contains
# a missing value in any of the remaining columns.
colsToDelete = [
    'ID', 'Source', 'TMC',
    'Start_Time', 'End_Time',
    'End_Lat', 'End_Lng',
    'Description', 'Number', 'Street', 'Side',
    'City', 'County', 'State', 'Zipcode', 'Country',
    'Timezone', 'Airport_Code', 'Weather_Timestamp',
]
data = data.drop(columns=colsToDelete).dropna()
data.head()

In [None]:
# Show the dtype of each remaining column to see which ones are not numeric yet.
print(data.dtypes)

In [None]:
def findIndex(x, lst=()):
    """Return the position of the first element of ``lst`` that equals ``x``.

    Parameters
    ----------
    x : object
        Value to look for.
    lst : sequence, optional
        Collection to scan in order (list, tuple, 1-D NumPy array, ...).
        Defaults to an empty, immutable sequence.

    Returns
    -------
    int or None
        Zero-based index of the first match, or ``None`` when no element
        compares equal to ``x``.
    """
    for position, item in enumerate(lst):
        if item == x:
            return position
    # Explicit so that the "not found" result is part of the documented contract.
    return None

In [None]:
# Distinct values of each categorical column, in order of first appearance.
# The position of a value inside its collection is what the encoding step uses
# as the integer code, so all six collections are materialized as plain lists.
WindDirections        = list( data["Wind_Direction"].unique() )
WeatherConditions     = list( data["Weather_Condition"].unique() )
SunriseSunsets        = list( data["Sunrise_Sunset"].unique() )
CivilTwilights        = list( data["Civil_Twilight"].unique() )
NauticalTwilights     = list( data["Nautical_Twilight"].unique() )
AstronomicalTwilights = list( data["Astronomical_Twilight"].unique() )

In [None]:
# Replace every categorical value by its position in the matching collection
# of distinct values.
#
# The lookup is done with a single Series.map per column and the result is
# assigned back to the frame. Chained assignment of the form
# data[col][mask] = value is avoided: it triggers SettingWithCopyWarning and
# leaves `data` untouched when pandas Copy-on-Write is enabled.
categoriesByColumn = {
    "Wind_Direction":        WindDirections,
    "Weather_Condition":     WeatherConditions,
    "Sunrise_Sunset":        SunriseSunsets,
    "Civil_Twilight":        CivilTwilights,
    "Nautical_Twilight":     NauticalTwilights,
    "Astronomical_Twilight": AstronomicalTwilights,
}
for column, categories in categoriesByColumn.items():
    codeOf = {value: code for code, value in enumerate(categories)}
    # Values absent from `categories` would become NaN; none are expected,
    # because each collection was built from this very column.
    data[column] = data[column].map(codeOf)

data.head()

In [None]:
# Turn the flag columns into 0/1 integers in one vectorized step instead of a
# Python-level loop over every row. astype(bool) applies the same truthiness
# rule as `1 if value else 0`.
cols = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station",
        "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop"]
data[cols] = data[cols].astype(bool).astype(int)

data.head()

In [None]:
# Cast every column to floating point so the whole frame is numeric.
data = data.astype(float)

# Building and testing prediction model.

OK, **the data has been processed**. To predict the location of an accident, **we must first split the data into predictors and target values, and check the predictors for pairwise correlation**.

In [None]:
# Separate the two target coordinates from the predictor columns.
targetColumns = ["Start_Lat", "Start_Lng"]
lat, lng = data["Start_Lat"], data["Start_Lng"]
X = data.drop(columns=targetColumns)

X.head()

In [None]:
# Visualize the pairwise correlation matrix of the predictors.
sns.heatmap(data=X.corr())

In [None]:
# Drop every predictor whose absolute correlation with any *other* predictor
# reaches 0.2. The correlation matrix is symmetric, so both members of each
# such pair are removed. NaN correlations never satisfy the comparison.
corrMatrix = X.corr()
isStrong = corrMatrix.abs() >= 0.2
colsToDelete = [
    name for name in corrMatrix.index
    # Ignore the diagonal entry (a column compared with itself).
    if isStrong[name].drop(labels=name).any()
]
X = X.drop(columns=colsToDelete)
X.head()

In [None]:
# Number of predictor columns that survived the correlation filter.
print(X.shape[1])

OK, now we have a good list of predictors. **Let's build a prediction model, beginning with linear regression.**

In [None]:
# Fit one least-squares linear model per coordinate on the same predictors.
predictLat, predictLng = (
    LinearRegression().fit(X, target) for target in (lat, lng)
)

MSE for prediction models for latitude and longitude.

In [None]:
# Mean squared error of the latitude model on the data it was fitted on.
latMse = metrics.mean_squared_error(y_true=lat, y_pred=predictLat.predict(X))
print(latMse)

In [None]:
# Mean squared error of the longitude model on the data it was fitted on.
lngMse = metrics.mean_squared_error(y_true=lng, y_pred=predictLng.predict(X))
print(lngMse)

R^2 for prediction models for latitude and longitude.

In [None]:
# Coefficient of determination of the latitude model on its fitting data.
latR2 = metrics.r2_score(y_true=lat, y_pred=predictLat.predict(X))
print(latR2)

In [None]:
# Coefficient of determination of the longitude model on its fitting data.
lngR2 = metrics.r2_score(y_true=lng, y_pred=predictLng.predict(X))
print(lngR2)

MSLE for prediction models for latitude and longitude.

In [None]:
# Mean squared log error of the latitude model; np.fabs keeps both inputs
# non-negative before they are passed to the metric.
latMsle = metrics.mean_squared_log_error(
    y_true=np.fabs(lat),
    y_pred=np.fabs(predictLat.predict(X)),
)
print(latMsle)

In [None]:
# Mean squared log error of the longitude model; np.fabs keeps both inputs
# non-negative before they are passed to the metric.
lngMsle = metrics.mean_squared_log_error(
    y_true=np.fabs(lng),
    y_pred=np.fabs(predictLng.predict(X)),
)
print(lngMsle)

MAE for prediction models for latitude and longitude.

In [None]:
# Mean absolute error of the latitude model on the data it was fitted on.
latMae = metrics.mean_absolute_error(y_true=lat, y_pred=predictLat.predict(X))
print(latMae)

In [None]:
# Mean absolute error of the longitude model on the data it was fitted on.
lngMae = metrics.mean_absolute_error(y_true=lng, y_pred=predictLng.predict(X))
print(lngMae)

As we see, different metrics give different answers about the quality of the linear regression model as a predictor of where an accident begins. But the main metrics show that **the built regression model is good enough to use**. Note that all of these metrics are computed on the same data the models were fitted on, so they measure goodness of fit rather than performance on unseen accidents.
We have working prediction models, but can we build a better model using a higher-degree regression model? First, let's check this for quadratic regression models.

In [None]:
# Expand the predictors into their degree-2 polynomial terms. The transformer
# returns a plain array, so the new frame gets default integer column labels.
quadraticExpansion = PolynomialFeatures(degree=2)
newX = pd.DataFrame(quadraticExpansion.fit_transform(X))
newX.head()

In [None]:
# Visualize the pairwise correlation matrix of the degree-2 features.
sns.heatmap(data=newX.corr())

In [None]:
# Drop every degree-2 feature whose absolute correlation with any *other*
# feature reaches 0.2. The correlation matrix is symmetric, so both members of
# each such pair are removed. NaN correlations never satisfy the comparison.
corrMatrix = newX.corr()
isStrong = corrMatrix.abs() >= 0.2
colsToDelete = [
    label for label in corrMatrix.index
    # Ignore the diagonal entry (a feature compared with itself).
    if isStrong[label].drop(labels=label).any()
]
newX = newX.drop(columns=colsToDelete)
newX.head()

In [None]:
# Fit one least-squares model per coordinate on the filtered degree-2 features.
predictLatD2, predictLngD2 = (
    LinearRegression().fit(newX, target) for target in (lat, lng)
)

MSE for prediction models for latitude and longitude.

In [None]:
# Mean squared error of the quadratic latitude model on its fitting data.
latMseD2 = metrics.mean_squared_error(y_true=lat, y_pred=predictLatD2.predict(newX))
print(latMseD2)

In [None]:
# Mean squared error of the quadratic longitude model on its fitting data.
lngMseD2 = metrics.mean_squared_error(y_true=lng, y_pred=predictLngD2.predict(newX))
print(lngMseD2)

R^2 for prediction models for latitude and longitude.

In [None]:
# Coefficient of determination of the quadratic latitude model on its fitting data.
latR2D2 = metrics.r2_score(y_true=lat, y_pred=predictLatD2.predict(newX))
print(latR2D2)

In [None]:
# Coefficient of determination of the quadratic longitude model on its fitting data.
lngR2D2 = metrics.r2_score(y_true=lng, y_pred=predictLngD2.predict(newX))
print(lngR2D2)

MSLE for prediction models for latitude and longitude.

In [None]:
# Mean squared log error of the quadratic latitude model; np.fabs keeps both
# inputs non-negative before they are passed to the metric.
latMsleD2 = metrics.mean_squared_log_error(
    y_true=np.fabs(lat),
    y_pred=np.fabs(predictLatD2.predict(newX)),
)
print(latMsleD2)

In [None]:
# Mean squared log error of the quadratic longitude model; np.fabs keeps both
# inputs non-negative before they are passed to the metric.
lngMsleD2 = metrics.mean_squared_log_error(
    y_true=np.fabs(lng),
    y_pred=np.fabs(predictLngD2.predict(newX)),
)
print(lngMsleD2)

MAE for prediction models for latitude and longitude.

In [None]:
# Mean absolute error of the quadratic latitude model on its fitting data.
latMaeD2 = metrics.mean_absolute_error(y_true=lat, y_pred=predictLatD2.predict(newX))
print(latMaeD2)

In [None]:
# Mean absolute error of the quadratic longitude model on its fitting data.
lngMaeD2 = metrics.mean_absolute_error(y_true=lng, y_pred=predictLngD2.predict(newX))
print(lngMaeD2)

As we see, the prediction models built using quadratic regression are worse. This may mean that, in this situation, linear regression models are the best option for building prediction models.