In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import numpy as np
import seaborn as sns

# linear modelling
import scipy.stats as st
import statsmodels.api as sm
from scipy import stats
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

# metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Census API Key
from census import Census
from config import api_key
c = Census(api_key, year=2017)

In [None]:
df=pd.read_csv("Airbnb_Texas_Rentals.csv")
df

In [None]:
df.info()

In [None]:
df.count()

In [None]:
clean_df = df.dropna()
clean_df.info()

In [None]:
# summary data frame
clean_df2 = clean_df.loc[:, ["city", "date_of_listing", "bedrooms_count", "average_rate_per_night", "latitude", "longitude"]]
clean_df2

In [None]:
clean_df2.info()

# Data Cleaning

In [None]:
clean_df2["date_of_listing"]=pd.to_datetime(clean_df2["date_of_listing"])

clean_df2.info()

In [None]:
clean_df2["year"]=[x.year for x in clean_df2.date_of_listing]
clean_df2.head()

In [None]:
unique_year = clean_df2["year"].nunique()
unique_year

In [None]:
clean_df2["month"]=[x.month for x in clean_df2.date_of_listing]
clean_df2.head()

In [None]:
clean_df2.info()

In [None]:
# mask1 = clean_df2.loc[clean_df2['year']==2012].index
# mask2 = clean_df2.loc[clean_df2['year']==2011].index
# mask3 = clean_df2.loc[clean_df2['year']==2010].index
# mask4 = clean_df2.loc[clean_df2['year']==2009].index
# mask5 = clean_df2.loc[clean_df2['year']==2008].index

In [None]:
# mask1.array

In [None]:
clean_df3 = clean_df2[~(clean_df2.year.isin([2012, 2011, 2010, 2009, 2008]))].reset_index()
clean_df3.head()

In [None]:
clean_df3.info()

In [None]:
# Create the bins in which Data will be held
# Bins are  12-1-2016, 7-1-2016, 1-1-2018
bins = [0, 3, 6, 9, 12]

# Create the names for the five bins
group_names = ["First", "Second", "Third", "Fourth"]

In [None]:
clean_df3["Date_Quarter"] = pd.cut(clean_df3["month"], bins, labels=group_names, include_lowest=True)
clean_df3

In [None]:
# summary data frame
clean_df4 = clean_df3.loc[:, ["city", "year", "Date_Quarter", "bedrooms_count", "average_rate_per_night", "latitude", "longitude"]]
clean_df4.head()

In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="http")

In [None]:
location = geolocator.reverse("30.020138, -95.293996")
location.raw

In [None]:
# counties = []
# zipcodes = []

# for indx,row in clean_df4.iterrows():
#     lat=row["latitude"]
#     lon=row["longitude"]
#     location = geolocator.reverse(f"{lat}, {lon}", timeout=None)
    
#     county = location.raw["address"].get("county")
#     zipcode = location.raw["address"].get("postcode")
#     counties.append(county)
#     zipcodes.append(zipcode)

#     if indx % 100==0:
#         print(indx)
    

In [None]:
# len(counties)

In [None]:
# clean_df4.head()


In [None]:
# len(zipcodes)

In [None]:
# counties

In [None]:
# clean_df4['counties'] = counties
# clean_df4['zipcodes'] = zipcodes
# clean_df4

In [None]:
# # Export file as a CSV, without the Pandas index, but with the header - So we don't have to run the above code every
# # time we open the file. 
# clean_df4.to_csv("airbnb_counties.csv", index=False, header=True)

In [None]:
# read in airbnb_counties csv
airbnb_df=pd.read_csv("airbnb_counties.csv")
airbnb_df

In [None]:
airbnb_df.info()

In [None]:
# read in census csv
census_df=pd.read_csv("acs2017_county_data.csv")
census_df.head()

In [None]:
# Change 'Studio' room to 0
# Change value from object to numeric

bedrooms_clean = airbnb_df['bedrooms_count']

bedrooms_clean = pd.to_numeric(["0" if i=='Studio' else i for i in bedrooms_clean])
bedrooms_clean

In [None]:
# Add bedrooms into Data Frame
airbnb_df['bedrooms'] = bedrooms_clean
airbnb_df

In [None]:
airbnb_df.info()

In [None]:
# Remove $ and convert into integer
airbnb_df['average_rate_per_night'] = airbnb_df['average_rate_per_night'].str.replace('$', '').astype(int)
airbnb_df

In [None]:
rate_mean = airbnb_df.groupby("counties")["average_rate_per_night"].mean()
rate_median = airbnb_df.groupby("counties")["average_rate_per_night"].median()
bed_mean = airbnb_df.groupby("counties")["bedrooms"].mean()
bed_median = airbnb_df.groupby("counties")["bedrooms"].median()
 
    
# Assemble the resulting series into a single summary dataframe
agg_df = pd.DataFrame({"airbnb_rate_mean": rate_mean,
                       "airbnb_rate_median": rate_median,
                       "airbnb_beds_mean": bed_mean,
                        "airbnb_beds_median": bed_median}).reset_index()
agg_df

In [None]:
agg_df.info()

In [None]:
census_tx = census_df.loc[census_df.State == "Texas"].reset_index()
census_tx.head()

In [None]:
census_tx.loc[census_tx.County == "Harris County"]

In [None]:
# summary data frame
clean_tx = census_tx.loc[:, ["County", "TotalPop", "Income", "Poverty"]]
clean_tx.info()

In [None]:
clean_tx = clean_tx.rename(columns = {"County":"counties"})
clean_tx.head()

In [None]:
clean_tx.loc[census_tx.County == "Harris County"]

In [None]:
# Merge Airbnb and Census dataframes on counties to show all data
clean_tx2 = pd.merge(airbnb_df, clean_tx, on="counties")
clean_tx2.head()

In [None]:
clean_tx2.info()

In [None]:
# Merge census data with aggregation data to show mean/median for counties and nan values for counties with no data
# will skip in analysis
clean_tx3 = pd.merge(agg_df, clean_tx, on="counties", how='right')
clean_tx3.head()

In [None]:
clean_tx3.info()

In [None]:
clean_tx4 = clean_tx3.fillna(0)
clean_tx4

In [None]:
# Drop Null valus from table
clean_tx5 = clean_tx3.dropna(how='any')
clean_tx5

In [None]:
# Make Bins for Rural and Uban 

bins_1 = [0, 50000, 4600000]

# Create the names for the five bins
group_names1 = ["Rural", "Urban"]

In [None]:
clean_tx2["Urban_Rural"] = pd.cut(clean_tx2["TotalPop"], bins_1, labels=group_names1, include_lowest=True)
clean_tx2

In [235]:
clean_tx2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16799 entries, 0 to 16798
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   city                    16799 non-null  object  
 1   year                    16799 non-null  int64   
 2   Date_Quarter            16799 non-null  object  
 3   bedrooms_count          16799 non-null  object  
 4   average_rate_per_night  16799 non-null  int32   
 5   latitude                16799 non-null  float64 
 6   longitude               16799 non-null  float64 
 7   counties                16799 non-null  object  
 8   zipcodes                16030 non-null  object  
 9   bedrooms                16799 non-null  int64   
 10  TotalPop                16799 non-null  int64   
 11  Income                  16799 non-null  int64   
 12  Poverty                 16799 non-null  float64 
 13  Urban_Rural             16799 non-null  category
dtypes: category(1), float6

In [236]:
clean_tx2.describe()

Unnamed: 0,year,average_rate_per_night,latitude,longitude,bedrooms,TotalPop,Income,Poverty
count,16799.0,16799.0,16799.0,16799.0,16799.0,16799.0,16799.0,16799.0
mean,2015.414668,213.936484,30.674222,-97.054034,1.780285,1357302.0,61148.34633,14.83227
std,1.153104,424.183393,1.683263,1.338457,1.236139,1414392.0,12535.272865,4.705826
min,2013.0,10.0,25.894075,-103.690925,0.0,2123.0,32135.0,5.7
25%,2015.0,57.0,29.680868,-97.817204,1.0,214231.0,53626.0,13.2
50%,2016.0,108.0,30.27006,-97.110965,1.0,914075.0,57791.0,13.9
75%,2016.0,220.0,32.669623,-96.279612,3.0,1983675.0,68350.0,16.8
max,2017.0,10000.0,35.256299,-93.771139,13.0,4525519.0,93645.0,31.2


In [238]:
def makeHistogram(clean_tx2, column):
    plt.figure(figsize= (10,6))
    plt.hist(clean_tx2[column])
    plt.ylabel('Count')
    plt.xlabel(column)
    plt.title(f"{column} Histogram", fontweight="bold", fontsize=16)
    plt.show()

In [None]:
for col in clean_tx2.columns:
    makeHistogram(clean_tx2, col)

In [248]:
st.normaltest(clean_tx2.Income)

NormaltestResult(statistic=1205.1888429580863, pvalue=1.9795566102195194e-262)

In [None]:
# Add bin to chart
clean_tx4["Urban_Rural"] = pd.cut(clean_tx4["TotalPop"], bins_1, labels=group_names1, include_lowest=True)
clean_tx4

In [247]:
clean_tx4.describe()

Unnamed: 0,airbnb_rate_mean,airbnb_rate_median,airbnb_beds_mean,airbnb_beds_median,TotalPop,Income,Poverty
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,89.423411,69.793307,0.852482,0.753937,107951.2,49894.338583,16.335039
std,141.350116,142.574104,1.163664,1.153909,389476.9,12132.675517,5.943009
min,0.0,0.0,0.0,0.0,74.0,24794.0,2.8
25%,0.0,0.0,0.0,0.0,7072.5,42326.5,12.925
50%,0.0,0.0,0.0,0.0,18612.5,48311.0,16.15
75%,166.5457,104.875,1.688779,1.0,49294.75,55740.75,18.7
max,1191.666667,1750.0,7.0,8.0,4525519.0,93645.0,41.5


In [252]:
def makeHistogram(clean_tx4, column):
    plt.figure(figsize= (9,5))
    plt.hist(clean_tx4[column])
    plt.ylabel('Count')
    plt.xlabel(column)
    plt.title(f"{column} Histogram", fontweight="bold", fontsize=16)
    plt.show()

In [None]:
for col in clean_tx4.columns:
    makeHistogram(clean_tx4, col)

In [254]:
st.normaltest(clean_tx4.airbnb_rate_mean)

NormaltestResult(statistic=181.59813527411103, pvalue=3.685241585703303e-40)

In [255]:
st.normaltest(clean_tx4.airbnb_beds_mean)

NormaltestResult(statistic=77.63333484832768, pvalue=1.3871886661933713e-17)

In [None]:
# Urban Data
urban_cleantx_df = clean_tx4.loc[clean_tx4.Urban_Rural == "Urban"]
urban_cleantx_df

In [None]:
# Rural data
rural_cleantx_df = clean_tx4.loc[clean_tx4.Urban_Rural == "Rural"]
rural_cleantx_df

In [None]:
# Urban and rural in the dropped values dataframe
clean_tx5["Urban_Rural"] = pd.cut(clean_tx5["TotalPop"], bins_1, labels=group_names1, include_lowest=True)
clean_tx5

In [None]:
# Urban Data
urban_cleantx_df2 = clean_tx5.loc[clean_tx5.Urban_Rural == "Urban"]
urban_cleantx_df2.info()

In [None]:
# Rural data
rural_cleantx_df2 = clean_tx5.loc[clean_tx5.Urban_Rural == "Rural"]
rural_cleantx_df2.info()

# Correlations and Heatmaps

In [None]:
# Correlation between airbnb and census
corrs1 = clean_tx2.corr()
corrs1

In [None]:
# Correlation between aggregation and census
corrs2 = clean_tx3.corr()
corrs2

In [None]:
# Correlation between aggregation and census with nan filled as 0
corrs3 = clean_tx4.corr()
corrs3

In [None]:
 # Correlation between aggregation and census with null airbnb values dropped
corrs_3a = clean_tx5.corr()
corrs_3a

In [None]:
# Correlation between aggregation and census accounting for urban counties only
corrs4 = urban_cleantx_df.corr()
corrs4

In [None]:
# Correlation between aggregation and census accounting for rural counties only
corrs5 = rural_cleantx_df.corr()
corrs5

In [None]:
# Create heatmap for correlation values (must get figure knowledge from examples )
plt.figure(figsize=(9,5))
sns.heatmap(corrs1)
plt.show()
    
plt.figure(figsize=(9,5))
sns.heatmap(corrs2)
plt.show()
    
plt.figure(figsize=(9,5))
sns.heatmap(corrs3)
plt.show()

plt.figure(figsize=(9,5))
sns.heatmap(corrs4)
plt.show()

plt.figure(figsize=(9,5))
sns.heatmap(corrs5)
plt.show()


In [None]:
# Breakdown of Urban vs Rural 
plt.figure(figsize=(30,10))

plt.subplot(1,2,1)
plt.title('Urban', fontsize=32)
sns.heatmap(corrs4,  cbar=True, linecolor="k", linewidths=1)
 
plt.subplot(1,2,2) 
plt.title('Rural', fontsize=32)
sns.heatmap(corrs5,  cbar=True, linecolor="k", linewidths=1) 

plt.savefig("incomevsRuralheatmap.png")
plt.show()

In [None]:
plt.figure(figsize=(18,12))
sns.heatmap(corrs4)
plt.savefig("Urban_incomevsrate_map.png")
plt.show()

## Benchmark for Correlations

In [None]:
feature = clean_tx2[["Poverty"]]
target = clean_tx2["Income"]
    
# STATSMODELS
X = sm.add_constant(feature)
Y = target

model = sm.OLS(Y,X)
results = model.fit()
results.summary()

In [None]:
# Add the linear regression equation and line to plot
x_values = clean_tx2["Poverty"]
y_values = clean_tx2["Income"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.figure(figsize = (12,8))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(15,30000),fontsize=18,color="red")
plt.xlabel('Poverty')
plt.ylabel('Income')
plt.show()

## Regressions for connected correlations

In [None]:
feature2 = urban_cleantx_df[["Income"]]
target2 = urban_cleantx_df["airbnb_rate_mean"]
    
# STATSMODELS
X = sm.add_constant(feature2)
Y = target2

model = sm.OLS(Y,X)
results2 = model.fit()
results2.summary()

In [None]:
plt.rc('figure', figsize=(12, 7))
#plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}) old approach
plt.text(0.01, 0.05, str(results2.summary()), {'fontsize': 12}, fontproperties = 'monospace') # approach improved by OP -> monospace!
plt.axis('off')
plt.tight_layout()
plt.savefig('ols_incomevsrate_urban.png')

In [None]:
# Add the linear regression equation and line to plot
x_values = urban_cleantx_df["Income"] 
y_values = urban_cleantx_df["airbnb_rate_mean"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(50000,50),fontsize=18,color="red")
plt.xlabel('Income')
plt.ylabel('Average Rate per Night')
plt.savefig("Urban_income_rate_correlation.png")
plt.show()

In [None]:
# SCIKIT_LEARN

reg = LinearRegression()
reg.fit(np.array(feature2), np.array(target2))

print(reg.coef_)
print(reg.intercept_)

print(reg.score(np.array(feature2), np.array(target2)))
reg.predict([[1000000]])

In [None]:
feature3 = urban_cleantx_df[["Income"]]
target3 = urban_cleantx_df["airbnb_beds_mean"]
    
# STATSMODELS
X = sm.add_constant(feature3)
Y = target3

model = sm.OLS(Y,X)
results3 = model.fit()
results3.summary()

In [None]:
x_values = urban_cleantx_df["Income"] 
y_values = urban_cleantx_df["airbnb_beds_mean"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(50000,1),fontsize=18,color="red")
plt.xlabel('Income')
plt.ylabel('Average number of beds per Night')
plt.savefig("Urban_income_beds_correlation.png")
plt.show()

In [None]:
feature4 = rural_cleantx_df[["TotalPop"]]
target4 = rural_cleantx_df["airbnb_rate_mean"]
    
# STATSMODELS
X = sm.add_constant(feature4)
Y = target4

model = sm.OLS(Y,X)
results4 = model.fit()
results4.summary()

In [None]:
plt.rc('figure', figsize=(12, 7))
#plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}) old approach
plt.text(0.01, 0.05, str(results4.summary()), {'fontsize': 12}, fontproperties = 'monospace') # approach improved by OP -> monospace!
plt.axis('off')
plt.tight_layout()
plt.savefig('ols_popvsrate_rural.png')

In [None]:
x_values = rural_cleantx_df["TotalPop"] 
y_values = rural_cleantx_df["airbnb_rate_mean"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(30000,50),fontsize=18,color="red")
plt.xlabel('Population')
plt.ylabel('Average rate per Night')
plt.savefig("rural_pop_rate_correlation.png")
plt.show()

In [None]:
feature5 = rural_cleantx_df[["TotalPop"]]
target5 = rural_cleantx_df["airbnb_beds_mean"]
    
# STATSMODELS
X = sm.add_constant(feature4)
Y = target5

model = sm.OLS(Y,X)
results5 = model.fit()
results5.summary()

In [None]:
x_values = rural_cleantx_df["TotalPop"] 
y_values = rural_cleantx_df["airbnb_beds_mean"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(30000,0.5),fontsize=18,color="red")
plt.xlabel('Population')
plt.ylabel('Average Beds per Night')
plt.savefig("rural_pop_beds_correlation.png")
plt.show()

## Poverty and Income effects Urban vs Rural


In [None]:
feature6 = urban_cleantx_df[["Poverty"]]
target6 = urban_cleantx_df["Income"]
    
# STATSMODELS
X = sm.add_constant(feature6)
Y = target6

model = sm.OLS(Y,X)
results6 = model.fit()
results6.summary()

In [None]:
x_values = urban_cleantx_df["Poverty"] 
y_values = urban_cleantx_df["Income"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(20,50000),fontsize=18,color="red")
plt.xlabel('Poverty')
plt.ylabel('Income')
plt.savefig("povertyvsincome_urban.png")
plt.show()

In [None]:
feature7 = rural_cleantx_df[["Poverty"]]
target7 = rural_cleantx_df["Income"]
    
# STATSMODELS
X = sm.add_constant(feature7)
Y = target7

model = sm.OLS(Y,X)
results7 = model.fit()
results7.summary()

In [None]:
x_values = rural_cleantx_df["Poverty"] 
y_values = rural_cleantx_df["Income"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(20,20000),fontsize=18,color="red")
plt.xlabel('Poverty')
plt.ylabel('Income')
plt.savefig("povertyvsincome_rural.png")
plt.show()

# Breakdown of top urban areas with predictions of rentals through 2022

In [None]:
harris_df = clean_tx2.loc[clean_tx2.counties == "Harris County"]
travis_df = clean_tx2.loc[clean_tx2.counties == "Travis County"]
tarrant_df = clean_tx2.loc[clean_tx2.counties == "Tarrant County"]
dallas_df = clean_tx2.loc[clean_tx2.counties == "Dallas County"]
bexar_df = clean_tx2.loc[clean_tx2.counties == "Bexar County"]

In [None]:
results_u1 = harris_df.append(travis_df)
results_u2 = results_u1.append(tarrant_df)
results_u3 = results_u2.append(dallas_df)
urban_top = results_u3.append(bexar_df)

urban_top

In [234]:
urban_top.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7760 entries, 0 to 6272
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   city                    7760 non-null   object  
 1   year                    7760 non-null   int64   
 2   Date_Quarter            7760 non-null   object  
 3   bedrooms_count          7760 non-null   object  
 4   average_rate_per_night  7760 non-null   int32   
 5   latitude                7760 non-null   float64 
 6   longitude               7760 non-null   float64 
 7   counties                7760 non-null   object  
 8   zipcodes                7702 non-null   object  
 9   bedrooms                7760 non-null   int64   
 10  TotalPop                7760 non-null   int64   
 11  Income                  7760 non-null   int64   
 12  Poverty                 7760 non-null   float64 
 13  Urban_Rural             7760 non-null   category
dtypes: category(1), float64(

In [None]:
urban_year = urban_top.groupby('year')['city'].count().reset_index()

urban_year

In [None]:
urban_top.corr()

In [None]:
urban_year.corr()

In [None]:
plt.figure(figsize = (15,8))
g=sns.lineplot(x='year', y="city", data=urban_year, color="black", linewidth=5,)
g.yaxis.grid(False) # Hide the horizontal gridlines
g.xaxis.grid(True) # Show the vertical gridlines
plt.title("Urban - Number of Rentals per Year", size=30)
plt.xlabel("Year", size=20)
plt.ylabel("Rentals", size=20)
g.xaxis.set_major_locator(ticker.MultipleLocator(1))
g.xaxis.set_major_formatter(ticker.ScalarFormatter())

In [232]:
urban_year

Unnamed: 0,year,city
0,2013,713
1,2014,1165
2,2015,1973
3,2016,2752
4,2017,1157


## Predictions for Number of Rentals

In [None]:
feature9 = urban_year[["year"]]
target9 = urban_year["city"]

In [None]:
reg = LinearRegression()
reg.fit(np.array(feature9), np.array(target9))
reg.coef_

In [None]:
reg.score(np.array(feature9), np.array(target9))

In [257]:
print(reg.predict([[2017]]))
print(reg.predict([[2018]]))
print(reg.predict([[2019]]))
print(reg.predict([[2020]]))
print(reg.predict([[2021]]))
print(reg.predict([[2022]]))

[2047.]
[2294.5]
[2542.]
[2789.5]
[3037.]
[3284.5]


# Live Demo for Urban Rentals

In [258]:
# Demo for Urban rentals
reg.predict([[2012]])

array([809.5])

# 

In [None]:
predicted = reg.predict(feature9)
actual = target9

In [None]:
plt.figure(figsize=(10,6))

plt.scatter(predicted, actual)

plt.xlabel("Predicted")
plt.ylabel("Actual")

plt.plot(urban_year.city, urban_year.city)

plt.show()

In [None]:
# Rural data
rural_df = clean_tx2.loc[clean_tx2.Urban_Rural == "Rural"]
rural_df

In [None]:
# Count of Rural counties
rural_county = rural_df.groupby('counties')['city'].count().reset_index().sort_values(by='city', ascending=False).head(5)

rural_county

In [None]:
llano_df = clean_tx2.loc[clean_tx2.counties == "Llano County"]
burnet_df = clean_tx2.loc[clean_tx2.counties == "Burnet County"]
aransas_df = clean_tx2.loc[clean_tx2.counties == "Aransas County"]
bandera_df = clean_tx2.loc[clean_tx2.counties == "Bandera County"]
kendall_df = clean_tx2.loc[clean_tx2.counties == "Kendall County"]

In [None]:
results1 = llano_df.append(burnet_df)
results2 = results1.append(aransas_df)
results3 = results2.append(bandera_df)
rural_top = results3.append(kendall_df)

rural_top

In [233]:
rural_year = rural_top.groupby('year')['city'].count().reset_index()

rural_year

Unnamed: 0,year,city
0,2013,50
1,2014,220
2,2015,256
3,2016,297
4,2017,244


In [None]:
feature10 = rural_year[["year"]]
target10 = rural_year["city"]

In [None]:
reg2 = LinearRegression()
reg2.fit(np.array(feature10), np.array(target10))
reg2.coef_

In [259]:
print(reg2.predict([[2017]]))
print(reg2.predict([[2018]]))
print(reg2.predict([[2018]]))
print(reg2.predict([[2020]]))
print(reg2.predict([[2021]]))
print(reg2.predict([[2022]]))

[306.4]
[352.9]
[352.9]
[445.9]
[492.4]
[538.9]


# Live Demo for Rural

In [None]:
print(reg2.predict([[*]]))

# 