In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyforest
!pip install xgboost
!pip install squarify

In [None]:
import pyforest
import squarify

# Plotting: imported explicitly so the notebook does not depend on pyforest
# injecting `plt` and `sns` into the namespace.
import matplotlib.pyplot as plt
import seaborn as sns

#regression
import xgboost as xg 
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import plot_importance


#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error
from sklearn.metrics import mean_squared_error as MSE



**Data Preparation**

In [None]:
# Load the state-level Zillow table and derive a calendar-year column.
state_csv_path = "/kaggle/input/zecon/State_time_series.csv"
State_time_series = pd.read_csv(state_csv_path, parse_dates=True)
# parse_dates=True only targets the index, so Date is converted explicitly.
State_time_series["Date"] = pd.to_datetime(State_time_series["Date"])
State_time_series["Year"] = State_time_series["Date"].dt.year
State_time_series.head()

In [None]:
# List every column of the loaded table (including the derived Year).
State_time_series.columns

**Feature Selection**

In [None]:
# Keep only the columns used in the rest of the notebook.
SELECTED_COLUMNS = [
    'Date', 'Year', 'RegionName',
    'MedianListingPrice_AllHomes', 'DaysOnZillow_AllHomes', 'Sale_Prices',
    'ZHVI_AllHomes', 'ZHVI_BottomTier', 'ZHVI_TopTier', 'ZHVI_MiddleTier',
]
# .copy() yields an independent frame, so later column writes do not act on a
# slice of the raw table (avoids SettingWithCopyWarning / lost writes).
State_time_series = State_time_series[SELECTED_COLUMNS].copy()

In [None]:
# Preview the first rows after the column selection.
State_time_series.head()

In [None]:
# Number of missing values per column, before imputation.
State_time_series.isna().sum()

**Treating Null Values**

In [None]:

# Impute missing values column by column.
# The fill values are passed to DataFrame.fillna as a {column: value} mapping
# and the result is re-assigned. Calling `df[col].fillna(..., inplace=True)`
# works on a temporary Series (chained assignment) and is not guaranteed to
# write back into the frame.
MEDIAN_FILLED_COLUMNS = [
    'DaysOnZillow_AllHomes',
    'Sale_Prices',
    'ZHVI_AllHomes',
    'ZHVI_BottomTier',
    'ZHVI_TopTier',
    'ZHVI_MiddleTier',
]
fill_values = State_time_series[MEDIAN_FILLED_COLUMNS].median().to_dict()
# The listing price is filled with its mean (not its median), as before.
fill_values['MedianListingPrice_AllHomes'] = State_time_series['MedianListingPrice_AllHomes'].mean()
State_time_series = State_time_series.fillna(value=fill_values)

In [None]:
# Re-count missing values per column to confirm the imputation.
State_time_series.isna().sum()

**Median Listing Price**

In [None]:
# Yearly average of the MedianListingPrice_AllHomes column.
listing_price_by_year = State_time_series.groupby('Year')['MedianListingPrice_AllHomes'].mean()

fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(listing_price_by_year.index, listing_price_by_year.values, color=[230/255, 159/255, 0])
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Median Price', fontsize=15)
# The plotted column is MedianListingPrice_AllHomes, not a per-square-foot
# series, so the title describes listing prices.
ax.set_title('Real Estate Median Listing Prices in US', fontsize=23)




**Permits**

In [None]:
# Load the permit series and reduce it to one row per calendar year.
permits_csv_path = "/kaggle/input/factors-effecting-housing-price-in-us/PERMIT.csv"
Permits = pd.read_csv(permits_csv_path, parse_dates=True)
Permits["Date"] = pd.to_datetime(Permits["Date"])
Permits["Year"] = Permits["Date"].dt.year
# Sum of the Permit column within each year, stored as integers.
Permits = Permits.groupby("Year")[["Permit"]].sum().astype("int64")
Permits.head()

In [None]:
# Inner join on Year: rows whose year has no entry in Permits are dropped.
# Re-running this cell would merge the column a second time.
State_time_series = pd.merge(State_time_series, Permits, how="inner", on="Year")
State_time_series.head()

In [None]:
# Count missing Permit values after the merge.
State_time_series['Permit'].isna().sum()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['Permit'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('Permits for New houses',fontsize = 15)
plt.title('Number of Permits, by Year',fontsize = 23)



In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['Permit','ZHVI_AllHomes']].groupby(['Permit'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Number of Permits(Thousands Of Units)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title('House Sale Value Decreases as Permits Increases(Supply Increases)',fontsize = 23)




**Days Listed on Zillow**

In [None]:
# Count missing DaysOnZillow_AllHomes values.
State_time_series['DaysOnZillow_AllHomes'].isna().sum()

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['DaysOnZillow_AllHomes','ZHVI_AllHomes']].groupby(['DaysOnZillow_AllHomes'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Number of days',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title('House Sale Value Decreases as listings ages on Zillow',fontsize = 23)




**Newly Built House**

In [None]:
# Load the new-house series and reduce it to one row per calendar year.
newly_built_csv_path = "/kaggle/input/factors-effecting-housing-price-in-us/Newly_built_house.csv"
Newly_built = pd.read_csv(newly_built_csv_path, parse_dates=True)
Newly_built["Date"] = pd.to_datetime(Newly_built["Date"])
Newly_built["Year"] = Newly_built["Date"].dt.year
# Median of the House column within each year, stored as integers.
Newly_built = Newly_built.groupby("Year")[["House"]].median().astype("int64")
Newly_built.head()

In [None]:
# Inner join on Year: rows whose year has no entry in Newly_built are dropped.
State_time_series = pd.merge(State_time_series, Newly_built, how="inner", on="Year")
State_time_series.head()


In [None]:
# Missing values per column after adding the House column.
State_time_series.isna().sum()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['House'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('Number of New houses(Thousands Of Units)',fontsize = 15)
plt.title('Number of New Houses, by Year',fontsize = 23)

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['House','Sale_Prices']].groupby(['House'])['Sale_Prices'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Newly built house(Thousands Of Units)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title("House Sale Value Decreases as Number of New House Increases(Supply Increases)",fontsize = 23)




**GDP**

In [None]:
# Load the GDP series and reduce it to one row per calendar year.
gdp_csv_path = "/kaggle/input/factors-effecting-housing-price-in-us/GDP.csv"
GDP = pd.read_csv(gdp_csv_path, parse_dates=True)
GDP["Date"] = pd.to_datetime(GDP["Date"])
GDP["Year"] = GDP["Date"].dt.year
# Median of the GDP column within each year, stored as integers.
GDP = GDP.groupby("Year")[["GDP"]].median().astype("int64")
GDP.head()

In [None]:
# Inner join on Year: rows whose year has no entry in GDP are dropped.
State_time_series = pd.merge(State_time_series, GDP, how="inner", on="Year")
State_time_series.head()

In [None]:
# Missing values per column after adding the GDP column.
State_time_series.isna().sum()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['GDP'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('GDP (Billions of Dollars)',fontsize = 15)
plt.title('GDP, by Year',fontsize = 23)

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['GDP','ZHVI_AllHomes']].groupby(['GDP'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('GDP (Billions of Dollars)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title("House Sale Value Increase as GDP Increases(Demand Increases)",fontsize=29)



**Mortgage Rates**

In [None]:
# Load the mortgage-rate series and reduce it to one row per calendar year.
Mortgage = pd.read_csv("/kaggle/input/factors-effecting-housing-price-in-us/Mortgage.csv", parse_dates=True)
Mortgage["Date"] = pd.to_datetime(Mortgage["Date"])
Mortgage["Year"] = Mortgage["Date"].dt.year
# Yearly median rate, kept as a float rounded to 2 decimals. The rate is
# plotted in percent, so casting to int64 would truncate e.g. 3.9 to 3 and
# collapse most years onto a handful of values.
Mortgage = Mortgage.groupby("Year")[["Mortgage"]].median().round(2)
Mortgage.tail()

In [None]:

# Inner join on Year: rows whose year has no entry in Mortgage are dropped.
State_time_series = pd.merge(State_time_series, Mortgage, how="inner", on="Year")
State_time_series.head()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['Mortgage'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('Mortgage Rate (In Percent)',fontsize = 15)
plt.title('Mortgage Rate, by Year',fontsize = 23)

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['Mortgage','ZHVI_AllHomes']].groupby(['Mortgage'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Mortgage Rates (In Percent)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title('House Sale Value Decreases as Mortgage Rate Increases(Demand Decreases)',fontsize=29)



**Unemployment**

In [None]:
# Load the county unemployment table and reduce it to one rate per year.
Unemployment = pd.read_csv("../input/unemployment-by-county-us/output.csv", parse_dates=True)
Unemployment = Unemployment.rename(columns={'Rate': 'Unemployment_Rate'})
# Yearly median rate, kept as a float rounded to 2 decimals. The rate is
# plotted in percent, so casting to int64 would truncate its fractional part.
Unemployment = Unemployment.groupby('Year')[['Unemployment_Rate']].median().round(2)
Unemployment.tail()

In [None]:

# Inner join on Year: rows whose year has no entry in Unemployment are dropped.
State_time_series = pd.merge(State_time_series, Unemployment, how="inner", on="Year")
State_time_series.head()

In [None]:
# Missing values per column after adding the Unemployment_Rate column.
State_time_series.isna().sum()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['Unemployment_Rate'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('Unemployment Rate (In Percent)',fontsize = 15)
plt.title('Unemployment Rate, by Year',fontsize = 23)

**Wages**

In [None]:
# Load the average-wage series and reduce it to one row per calendar year.
Wages = pd.read_csv("/kaggle/input/factors-effecting-housing-price-in-us/Average_wages.csv", parse_dates=True)
Wages["Date"] = pd.to_datetime(Wages["Date"])
Wages["Year"] = Wages["Date"].dt.year
# Select the columns by name rather than by position (iloc[:, [1, 2]]), as the
# other yearly tables do; Wage is the column the later plots read.
# The yearly median is kept as a float rounded to 2 decimals: the wage is
# plotted in dollars per hour, so an int64 cast would drop the cents.
Wages = Wages[["Year", "Wage"]].groupby("Year").median().round(2)
Wages.tail()

In [None]:

# Inner join on Year: rows whose year has no entry in Wages are dropped.
State_time_series = pd.merge(State_time_series, Wages, how="inner", on="Year")
State_time_series.head()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['Wage'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel('Average Hourly wage(In Dollars)',fontsize = 15)
plt.title('Average Hourly wage, by Year',fontsize = 23)

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['Wage','ZHVI_AllHomes']].groupby(['Wage'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Average Hourly Wages(In Dollars)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title('House Sale Value Increase as Wage Increases(Demand Increases)',fontsize=29)



**Demographic**

In [None]:
# Load the demographic series and reduce it to one row per calendar year.
Demographic = pd.read_csv("/kaggle/input/factors-effecting-housing-price-in-us/demographic.csv", parse_dates=True)
Demographic["Date"] = pd.to_datetime(Demographic["Date"])
Demographic["Year"] = Demographic["Date"].dt.year
# Select the columns by name rather than by position (iloc[:, [1, 2]]), as the
# other yearly tables do; PersonNumber is the column the later plots read.
Demographic = Demographic[["Year", "PersonNumber"]].groupby("Year").mean().astype("int64")
Demographic.tail()

In [None]:
# Inner join on Year: rows whose year has no entry in Demographic are dropped.
State_time_series = pd.merge(State_time_series, Demographic, how="inner", on="Year")
State_time_series.head()

In [None]:
plt.figure(figsize=(20, 6))
plt.plot(State_time_series['Year'],State_time_series['PersonNumber'], color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Year',fontsize = 15)
plt.ylabel("No. of Persons (100 Millions)",fontsize = 15)
plt.title('No. of People Between the Age 15-64, by Year',fontsize = 23)

In [None]:
plt.figure(figsize=(20, 6))
mean_group = State_time_series[['PersonNumber','ZHVI_AllHomes']].groupby(['PersonNumber'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Number of Persons Between Age 15-64 (100 Millions)',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
plt.title('House Sale Value Increase as People of this group Increases(Demand Increases)',fontsize=29)



**House Property Counts in Different States**

In [None]:

# Number of rows per region. The counts are read straight from the Series:
# the name of the column produced by value_counts().to_frame() differs
# between pandas versions, so indexing it by 'RegionName' is not portable.
region_counts = State_time_series['RegionName'].value_counts()

# Single figure at the final size, with square data units for the treemap.
fig, ax = plt.subplots(figsize=(40, 25), subplot_kw={'aspect': 'equal'})
squarify.plot(
    sizes=region_counts.values,
    label=region_counts.index,
    # One colour per region instead of a hard-coded 52.
    color=sns.color_palette('viridis', len(region_counts)),
    alpha=1,
    ax=ax,
)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title("Treemap of House Property counts across different States", fontsize=18)

**Every recorded state has the same number of listings except North Dakota, which you can see in the upper-right yellow corner. "United States" seems to have been used as the region name where no region name was available (nulls), so it has the lowest number of listings.**

**Crime Rate**

In [None]:
# Load the county-level crime table and preview it.
Crime_rates = pd.read_csv(r"../input/united-states-crime-rates-by-county/crime_data_w_population_and_crime_rate.csv")
Crime_rates.head()

In [None]:
# Build the combined county FIPS code as text: the state code followed by the
# county code zero-padded to three digits. Without the padding, state 6 /
# county 37 becomes "637" instead of "6037", and distinct counties can collide
# (1 + 11 and 11 + 1 both give "111"), so the later merge on FIPS misses them.
Crime_rates['FIPS'] = (
    Crime_rates['FIPS_ST'].astype(str)
    + Crime_rates['FIPS_CTY'].astype(str).str.zfill(3)
)
Crime_rates.head()

In [None]:
# Cast the FIPS key to an integer and keep only the key and the crime rate.
Crime_rates = Crime_rates.astype({"FIPS": "int64"})[['FIPS', 'crime_rate_per_100000']]
Crime_rates.head()

**Hospitals**

In [None]:
# Load the hospital table (one row per hospital) and preview it.
hospitals = pd.read_csv(r"../input/usa-hospitals/Hospitals.csv")
hospitals.head()

In [None]:
# Count hospitals per county FIPS code.
# rename_axis / reset_index(name=...) name both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that
# labelling changed between pandas versions and broke the old rename mapping).
hospitals_per_county = (
    hospitals['COUNTYFIPS']
    .value_counts()
    .rename_axis('FIPS')
    .reset_index(name='NumberOfHospitals')
    .dropna()
)
# Drop the placeholder code before casting the key to an integer.
hospitals_per_county = hospitals_per_county[hospitals_per_county['FIPS'] != 'NOT AVAILABLE']
hospitals_per_county = hospitals_per_county.astype({'FIPS': 'int64'})

hospitals_per_county.head()

**County Data**

In [None]:
# Load the county-level Zillow time series and preview it.
county_time_series = pd.read_csv("../input/zecon/County_time_series.csv")
county_time_series.head()

In [None]:
# Load the county crosswalk and keep the FIPS key with the county/state names.
crosswalk = pd.read_csv("../input/zecon/CountyCrossWalk_Zillow.csv")
# Select and cast in one chained expression: assigning a column on the sliced
# frame afterwards triggers SettingWithCopyWarning.
crosswalk = crosswalk[['FIPS', 'CountyName', 'StateName']].astype({'FIPS': 'int64'})
crosswalk.head()

**Average House Price By Region**

In [None]:
# Per-region means of the numeric columns. numeric_only=True is required on
# recent pandas, where averaging a frame with non-numeric columns raises.
house_prices = county_time_series.groupby("RegionName").mean(numeric_only=True)

# RegionName in the county table is used as the FIPS key for later merges.
team_est = (
    house_prices["ZHVI_AllHomes"]
    .rename_axis("FIPS")
    .reset_index(name="AverageHousePrice")
    .astype({"FIPS": "int64"})
    .dropna()
)

team_est.head()

**Schools**

In [None]:
# Load the public-school table (one row per school) and count schools per
# county FIPS code.
public_schools = pd.read_csv(r"../input/usa-public-schools/Public_Schools.csv")
# rename_axis / reset_index(name=...) name both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that
# labelling changed between pandas versions and broke the old rename mapping).
public_schools_per_county = (
    public_schools['COUNTYFIPS']
    .value_counts()
    .rename_axis('FIPS')
    .reset_index(name='NumberOfSchools')
    .astype({'FIPS': 'int64'})
)

public_schools_per_county.head()

**Merging Dataframes**

Merging all the datasets together to create a single dataset that gives a basic idea of house prices in terms of features such as unemployment rate, schools, hospitals, etc.

In [None]:
# Attach county and state names; counties missing from the crosswalk are dropped (inner join).
team_est = pd.merge(team_est, crosswalk, how="inner", on="FIPS")

team_est.head()

In [None]:
# Attach the school count; counties without one are dropped (inner join).
team_est = pd.merge(team_est, public_schools_per_county, how="inner", on="FIPS")
team_est.head()

In [None]:
# Attach the hospital count; counties without one are dropped (inner join).
team_est = pd.merge(team_est, hospitals_per_county, how="inner", on="FIPS")
team_est.head()

In [None]:
# Attach the crime rate; counties without one are dropped (inner join).
team_est = pd.merge(team_est, Crime_rates, how="inner", on="FIPS")
# Duplicate StateName as RegionName, the key the state-level table is merged on.
team_est = team_est.assign(RegionName=team_est['StateName'])

team_est.head()

In [None]:
# Attach the county-level table to the state-level time series by RegionName.
# NOTE(review): team_est appears to hold one row per county (FIPS), while
# State_time_series holds one row per region and date, so this looks like a
# many-to-many join that repeats each state/date row once per county.
# Confirm that is intended, or aggregate team_est per RegionName first.
# NOTE(review): rows match only where RegionName is spelled exactly like the
# crosswalk's StateName -- verify both sources use the same spelling.
State_time_series =State_time_series.merge(team_est,on='RegionName')
State_time_series.head()

In [None]:
plt.figure(figsize=(20, 6))
g=sns.barplot(x="RegionName",y="crime_rate_per_100000",data=State_time_series)
g.set_xticklabels(g.get_xticklabels(),rotation=90)
g.set(xlabel='States', ylabel='Crime Rate per 100000', title='Crime Rate in Different States')

In [None]:
plt.figure(figsize=(20, 10))
mean_group = State_time_series[['crime_rate_per_100000','ZHVI_AllHomes']].groupby(['crime_rate_per_100000'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Crime Rate Per 100000',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
# plt.xticks(rotation = 90)
plt.title('House Sale Value Decreases as Crime Rates Increase(Demand Decreases)',fontsize=29)



In [None]:
 plt.figure(figsize=(20, 6))




g=sns.barplot(x="RegionName",y="NumberOfSchools",data=State_time_series)
g.set_xticklabels(g.get_xticklabels(),rotation=90)
g.set(xlabel='States', ylabel='Number Of Schools ', title='Number Of School in Different States')

In [None]:
plt.figure(figsize=(20, 10))
mean_group = State_time_series[['NumberOfSchools','ZHVI_AllHomes']].groupby(['NumberOfSchools'])['ZHVI_AllHomes'].mean()
plt.plot(mean_group, color=[230/255, 159/255, 0])
plt.tick_params(axis='both', which = 'major', labelsize = 18)
plt.xlabel('Number Of Schools',fontsize = 15)
plt.ylabel('ZHVI',fontsize = 15)
# plt.xticks(rotation = 90)
plt.title('House Sale Value Increases as Number of Schools Increase(Demand Increases)',fontsize=29)

In [None]:
# Bar chart of NumberOfHospitals per RegionName.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(data=State_time_series, x="RegionName", y="NumberOfHospitals", ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(xlabel='States', ylabel='Number Of Hospitals ', title='Number Of Hospitals in Different States')

In [None]:
# Mean ZHVI_AllHomes for each distinct NumberOfHospitals value.
zhvi_by_hospitals = State_time_series.groupby('NumberOfHospitals')['ZHVI_AllHomes'].mean()

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(zhvi_by_hospitals.index, zhvi_by_hospitals.values, color=(230/255, 159/255, 0))
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlabel('Number Of Hospitals', fontsize=15)
ax.set_ylabel('ZHVI', fontsize=15)
ax.set_title('House Sale Value Increases as Number of Hospitals Increase(Demand Increases)', fontsize=29)

In [None]:
# Bar chart of AverageHousePrice per RegionName.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(data=State_time_series, x="RegionName", y="AverageHousePrice", ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(xlabel='States', ylabel='Average House Price ', title='Average House Price in Different States')

In [None]:
# Yearly median of each ZHVI tier, drawn on one set of axes.
fig, ax = plt.subplots(figsize=(20, 6))
by_year = State_time_series.groupby('Year')
tier_colours = (('ZHVI_TopTier', 'g'), ('ZHVI_MiddleTier', 'b'), ('ZHVI_BottomTier', 'r'))
for tier_column, line_colour in tier_colours:
    by_year[tier_column].median().plot(ax=ax, linewidth=4, c=line_colour)

ax.set_title("Real Estate Properties ZHVI for different Tiers in US", fontsize=25)
# Legend entries come from the Series names, i.e. the tier column names.
ax.legend()


In [None]:
# Mean ZHVI_AllHomes for each distinct Unemployment_Rate value.
zhvi_by_unemployment = State_time_series.groupby('Unemployment_Rate')['ZHVI_AllHomes'].mean()

fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(zhvi_by_unemployment.index, zhvi_by_unemployment.values, color=(230/255, 159/255, 0))
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlabel('Unemployment Rates (In Percent)', fontsize=15)
ax.set_ylabel('ZHVI', fontsize=15)
ax.set_title('House Sale Value Decreases as Unemployment Rate Increases(Demand Decreases)', fontsize=29)


**How long does it take on average to sell a home?**

In [None]:
# Yearly mean of DaysOnZillow_AllHomes as a line chart.
days_listed_by_year = State_time_series.groupby('Year')['DaysOnZillow_AllHomes'].mean()
ax = days_listed_by_year.plot.line(figsize=(15, 7), color=[230/255, 159/255, 0])
ax.set_title('Days listed on Zillow for all homes, by Year', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Days listed on Zillow', fontsize=12)

**The average number of days that a home stayed listed on Zillow's website has continuously decreased over the years.**

In [None]:
# Mean DaysOnZillow_AllHomes per RegionName, sorted ascending, as a bar chart.
days_listed_by_state = (
    State_time_series.groupby('RegionName')['DaysOnZillow_AllHomes']
    .mean()
    .sort_values(ascending=True)
)
ax = days_listed_by_state.plot.bar(figsize=(20, 6), color=[230/255, 159/255, 0])
ax.set_title('Days listed on Zillow for all homes, by State', fontsize=16)
ax.set_xlabel('State', fontsize=14)
ax.set_ylabel('Days listed on Zillow', fontsize=14)

**Feature Selection**

In [None]:
# Remove the date and the identifier/name columns before the numeric analysis.
# Like the `del` statements it replaces, this fails if the cell is re-run
# after the columns are already gone.
IDENTIFIER_COLUMNS = ['Date', 'StateName', 'CountyName', 'FIPS', 'RegionName']
State_time_series = State_time_series.drop(columns=IDENTIFIER_COLUMNS)


In [None]:
# Columns that remain after dropping the identifier columns.
State_time_series.columns

**Correlation is a very useful measure: it gives a clear quantitative answer about which features are related to the target label, and by how much.**

In [None]:

# Annotated heatmap of the pairwise correlations between the remaining columns.
fig, ax = plt.subplots(figsize=(20, 25))
correlations = State_time_series.corr()
sns.heatmap(correlations, annot=True, ax=ax)

**Visual representation of correlation using a heatmap.**

In [None]:
# Columns available to the model (features plus the ZHVI_AllHomes target).
State_time_series.columns

In [None]:
# (rows, columns) of the final modelling table.
State_time_series.shape

**Training Model**

In [None]:
# Features = every column except the target; target = ZHVI_AllHomes.
# NOTE(review): X still contains ZHVI_BottomTier / ZHVI_MiddleTier /
# ZHVI_TopTier and AverageHousePrice, which look closely related to the
# target -- check for target leakage before trusting the scores.
X, y = State_time_series.loc[:, State_time_series.columns != 'ZHVI_AllHomes'], State_time_series.loc[:, 'ZHVI_AllHomes'] 

In [None]:

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Baseline gradient-boosted regressor.
xgb_r = xg.XGBRegressor(
    # 'reg:linear' is the deprecated name of the squared-error objective.
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    # L1 regularisation, passed under the scikit-learn wrapper's own name.
    reg_alpha=10,
    random_state=777,
    n_estimators=100,
)

In [None]:
# Fit the baseline regressor on the training split.
xgb_r.fit(train_X, train_y)

In [None]:
# Predict ZHVI_AllHomes for the held-out rows.
pred = xgb_r.predict(test_X) 


In [None]:
# Root mean squared log error on the held-out rows.
# scikit-learn metrics take (y_true, y_pred); the arguments are passed in
# that order rather than reversed.
rmsle = np.sqrt(mean_squared_log_error(test_y, pred))
rmsle

In [None]:
# Importance scores of the fitted model, one per feature.
xgb_r.feature_importances_

In [None]:
# Column names, shown to read the importance scores against.
State_time_series.columns

In [None]:
# Plot the baseline model's feature importances with xgboost's helper.
plot_importance(xgb_r)
plt.show()

In [None]:
# Scatter of predicted against actual values for the held-out rows.
plt.figure(figsize=(25, 20))
paired_values = list(zip(pred, test_y))
df = pd.DataFrame(paired_values, columns=['Predicted', 'Test'])
# set_style changes seaborn's global style, so it also applies to later plots.
sns.set_style("whitegrid")

sns.scatterplot(data=df, x='Predicted', y='Test', hue='Test')



**Time for some parameter tuning.**

In [None]:
# Second XGBoost configuration: deeper trees, row subsampling, all columns.
xgb_r = xg.XGBRegressor(
    # 'reg:linear' is the deprecated name of the squared-error objective.
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
    # subsample < 1 makes training stochastic; use the same seed as the
    # baseline model so the result is repeatable.
    random_state=777,
)

In [None]:
# Fit the re-configured regressor on the training split.
xgb_r.fit(train_X, train_y)

In [None]:
# Predict the held-out rows and compute the root mean squared log error.
pred = xgb_r.predict(test_X)
# scikit-learn metrics take (y_true, y_pred); the arguments are passed in
# that order rather than reversed.
rmsle = np.sqrt(mean_squared_log_error(test_y, pred))
rmsle


In [None]:

# Plot the re-configured model's feature importances.
plot_importance(xgb_r)
plt.show()



In [None]:
# Scatter of predicted against actual values for the re-configured model.
plt.figure(figsize=(25, 20))
paired_values = list(zip(pred, test_y))
df = pd.DataFrame(paired_values, columns=['Predicted', 'Test'])
# set_style changes seaborn's global style, so it also applies to later plots.
sns.set_style("whitegrid")

sns.scatterplot(data=df, x='Predicted', y='Test', hue='Test')



In [None]:
# Fit several off-the-shelf regressors with default settings and record each
# one's RMSLE on the held-out rows.
# The stochastic ensembles get a fixed random_state so a re-run of the
# notebook reproduces the same ranking.
candidate_models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=123),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=123),
    'BaggingRegressor': BaggingRegressor(random_state=123),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
}
model_names = list(candidate_models)
models = list(candidate_models.values())

rmsle = []
for clf in models:
    clf.fit(train_X, train_y)
    test_pred = clf.predict(test_X)
    # Metric arguments are (y_true, y_pred).
    rmsle.append(np.sqrt(mean_squared_log_error(test_y, test_pred)))

# Names and scores side by side, in the same order.
d = {'Modelling Algo': model_names, 'RMSLE': rmsle}
d

**Time for some parameter tuning. Going with Random Forest as it performed best.**

In [None]:

# Grid-search the random forest's max_features setting with 500 trees.
no_of_test = [500]
params_dict = {
    'n_estimators': no_of_test,
    'n_jobs': [-1],
    # "auto" is no longer accepted by recent scikit-learn. For a regressor it
    # meant "use all features", which is what 1.0 requests.
    'max_features': [1.0, 'sqrt', 'log2'],
}
clf_rf = GridSearchCV(
    # Fixed seed so the search result is repeatable.
    estimator=RandomForestRegressor(random_state=123),
    param_grid=params_dict,
    scoring='neg_mean_squared_log_error',
)
clf_rf.fit(train_X, train_y)
predi = clf_rf.predict(test_X)
# RMSLE of the best estimator on the held-out rows; arguments are (y_true, y_pred).
print(np.sqrt(mean_squared_log_error(test_y, predi)))

In [None]:
# Parameter combination selected by the grid search.
clf_rf.best_params_

In [None]:
# Mean absolute error of the tuned random forest on the held-out rows.
predictions = clf_rf.predict(test_X)
mae = mean_absolute_error(predictions, test_y)
print("Mean Absolute Error: " + str(mae))
predictions