# Model Exercises (TSA EXERCISE PROJECT ACQUIRE AND PREP)
- moved rest of notebooks to their own repo
- check repo for  explore and modeling

## Imports

In [1]:
import numpy as np
import pandas as pd

#viz
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#for presentation purposes
import warnings
warnings.filterwarnings("ignore")

#working with dates
from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt 

#evaluated performance using rmse
import statsmodels.api as sm

#holts linear trend model
from statsmodels.tsa.api import Holt

## Acquire

In [2]:
#csv to dataframe
df = pd.read_csv('GlobalLandTemperaturesByCity.csv')

In [3]:
#check out the data
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [4]:
#check out how many cities are in this dataset
df.City.value_counts

<bound method IndexOpsMixin.value_counts of 0           Århus
1           Århus
2           Århus
3           Århus
4           Århus
            ...  
8599207    Zwolle
8599208    Zwolle
8599209    Zwolle
8599210    Zwolle
8599211    Zwolle
Name: City, Length: 8599212, dtype: object>

In [5]:
#check out columns and datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [6]:
#check out shape of df
df.shape

(8599212, 7)

In [9]:
#find san antonio
sa = df[(df.City == "San Antonio")]

In [10]:
#begining of san antonio data
sa.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
6618616,1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W
6618617,1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W
6618618,1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W
6618619,1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W
6618620,1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W


In [11]:
#amount of nulls in san antonio data
sa.isnull().sum()

dt                               0
AverageTemperature               9
AverageTemperatureUncertainty    9
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64

In [16]:
#shape of san antonio dataframe
sa.shape

(2325, 7)

In [17]:
#summary of columns and datatypes
sa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2325 entries, 6618616 to 6620940
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   dt                             2325 non-null   object 
 1   AverageTemperature             2316 non-null   float64
 2   AverageTemperatureUncertainty  2316 non-null   float64
 3   City                           2325 non-null   object 
 4   Country                        2325 non-null   object 
 5   Latitude                       2325 non-null   object 
 6   Longitude                      2325 non-null   object 
dtypes: float64(2), object(5)
memory usage: 145.3+ KB


In [18]:
#summary statistics
sa.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,2316.0,2316.0
mean,19.980095,0.812963
std,6.870505,0.810973
min,4.023,0.061
25%,13.74825,0.262
50%,20.3055,0.399
75%,26.686,1.24
max,32.166,5.063


## Prepare

In [33]:
#begining and end: 1820-2013
sa.index.min(), sa.index.max()

(Timestamp('1820-01-01 00:00:00'), Timestamp('2013-09-01 00:00:00'))

In [20]:
# Reassign the sale_date column to be a datetime type
sa.dt = pd.to_datetime(sa.dt)

In [21]:
# Sort rows by the date and then set the index as that date
sa = sa.set_index("dt").sort_index()

In [24]:
#ensure the change took place
sa.tail(10)

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-12-01,13.435,0.346,San Antonio,United States,29.74N,97.85W
2013-01-01,11.667,0.291,San Antonio,United States,29.74N,97.85W
2013-02-01,14.113,0.354,San Antonio,United States,29.74N,97.85W
2013-03-01,15.944,0.378,San Antonio,United States,29.74N,97.85W
2013-04-01,19.041,0.278,San Antonio,United States,29.74N,97.85W
2013-05-01,23.617,0.217,San Antonio,United States,29.74N,97.85W
2013-06-01,28.652,0.537,San Antonio,United States,29.74N,97.85W
2013-07-01,28.946,0.329,San Antonio,United States,29.74N,97.85W
2013-08-01,29.937,0.416,San Antonio,United States,29.74N,97.85W
2013-09-01,27.727,1.123,San Antonio,United States,29.74N,97.85W


In [28]:
#make new columns for year, month, and weekday
sa['month'] = sa.index.month
sa['weekday'] = sa.index.day_name()

In [29]:
sa.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,month,weekday
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W,1,Saturday
1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W,2,Tuesday
1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W,3,Wednesday
1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W,4,Saturday
1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W,5,Monday


In [30]:
#change column names
sa.rename(columns={"AverageTemperature": "avg_temp", 
                   "AverageTemperatureUncertainty": "avg_temp_uncertainty", 
                   "City": "city", 
                   "Country": "country", 
                   "Latitude": "latitude", 
                   "Longitude": "longitude"}, inplace=True)

In [31]:
sa.head()

Unnamed: 0_level_0,avg_temp,avg_temp_uncertainty,city,country,latitude,longitude,month,weekday
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W,1,Saturday
1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W,2,Tuesday
1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W,3,Wednesday
1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W,4,Saturday
1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W,5,Monday


In [34]:
#amount of nulls in san antonio data
sa.isnull().sum()

avg_temp                9
avg_temp_uncertainty    9
city                    0
country                 0
latitude                0
longitude               0
month                   0
weekday                 0
dtype: int64

In [35]:
#percentage of missing values
round(sa.isna().mean() * 100, 2)

avg_temp                0.39
avg_temp_uncertainty    0.39
city                    0.00
country                 0.00
latitude                0.00
longitude               0.00
month                   0.00
weekday                 0.00
dtype: float64

In [36]:
bool_series = pd.isnull(sa["avg_temp"])

In [37]:
sa[bool_series]

Unnamed: 0_level_0,avg_temp,avg_temp_uncertainty,city,country,latitude,longitude,month,weekday
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1821-11-01,,,San Antonio,United States,29.74N,97.85W,11,Thursday
1821-12-01,,,San Antonio,United States,29.74N,97.85W,12,Saturday
1822-01-01,,,San Antonio,United States,29.74N,97.85W,1,Tuesday
1822-02-01,,,San Antonio,United States,29.74N,97.85W,2,Friday
1822-03-01,,,San Antonio,United States,29.74N,97.85W,3,Friday
1822-09-01,,,San Antonio,United States,29.74N,97.85W,9,Sunday
1822-10-01,,,San Antonio,United States,29.74N,97.85W,10,Tuesday
1822-11-01,,,San Antonio,United States,29.74N,97.85W,11,Friday
1822-12-01,,,San Antonio,United States,29.74N,97.85W,12,Sunday


#### "Since reliable records began around 1880s, San Antonio’s average temperatures have climbed roughly 0.2 degrees Fahrenheit per decade for a total of 2.4 degrees, according to the National Oceanic and Atmospheric Administration."
- Only 9 missing avg temp values, so I will fill the nulls with the mean avg temp from one decade up.

In [46]:
#find mean of decades
first_decade = sa.loc['1823-01-01' : '1830-01-01']

In [47]:
first_decade.month

dt
1823-01-01     9.425
1823-02-01     9.864
1823-03-01    16.228
1823-04-01    20.097
1823-05-01    23.811
               ...  
1829-09-01    24.950
1829-10-01    21.003
1829-11-01    13.783
1829-12-01    11.498
1830-01-01    11.091
Name: avg_temp, Length: 85, dtype: float64

In [61]:
#created datasets from months of first decade
jan = first_decade[(first_decade.month == 1)]
feb = first_decade[(first_decade.month == 2)]
march = first_decade[(first_decade.month == 3)]
april = first_decade[(first_decade.month == 4)]
may = first_decade[(first_decade.month == 5)]
june = first_decade[(first_decade.month == 6)]
july = first_decade[(first_decade.month == 7)]
aug = first_decade[(first_decade.month == 8)]
sep = first_decade[(first_decade.month == 9)]
octo = first_decade[(first_decade.month == 10)]
nov = first_decade[(first_decade.month == 11)]
dec = first_decade[(first_decade.month == 12)]

In [62]:
#mean temps of first decade
#a few of the temps would be the same for 9 consecutive rows so just going 
# to change them one by one since there are only 9 to change
first_decade_temp_mean = first_decade.avg_temp.mean()

In [63]:
#avg temp for each month in first decade
j1 = jan.avg_temp.mean()
j2 = jan.avg_temp_uncertainty.mean()
print(j1,j2)

10.2945 2.959


In [70]:
#jan 1822
sa.at['1822-01-01', 'avg_temp'] = 10.2945
sa.at['1822-01-01', 'avg_temp_uncertainty'] = 2.959

In [64]:
#avg temp for each month in first decade
f1 = feb.avg_temp.mean()
f2 = feb.avg_temp_uncertainty.mean()
print(f1,f2)

11.775428571428572 2.8907142857142856


In [71]:
#feb 1822
sa.at['1822-02-01', 'avg_temp_uncertainty'] = 2.891
sa.at['1822-02-01', 'avg_temp'] = 11.775

In [72]:
#avg temp for each month in first decade
m1 = march.avg_temp.mean()
m2 = march.avg_temp_uncertainty.mean()
print(m1,m2)

16.193 2.6964285714285716


In [73]:
#march 1822
sa.at['1822-03-01', 'avg_temp_uncertainty'] = 2.696
sa.at['1822-03-01', 'avg_temp'] = 16.193

In [74]:
#avg temp for each month in first decade
sep1 = sep.avg_temp.mean()
sep2 = sep.avg_temp_uncertainty.mean()
print(sep1,sep2)

24.817571428571426 1.908857142857143


In [75]:
#sep 1822
sa.at['1822-09-01', 'avg_temp_uncertainty'] = 1.909
sa.at['1822-09-01', 'avg_temp'] = 24.818

In [76]:
#avg temp for each month in first decade
octo1 = octo.avg_temp.mean()
octo2 = octo.avg_temp_uncertainty.mean()
print(octo1,octo2)

20.250857142857143 2.1700000000000004


In [77]:
#oct 1822
sa.at['1822-10-01', 'avg_temp_uncertainty'] = 2.170
sa.at['1822-10-01', 'avg_temp'] = 20.251

In [78]:
#avg temp for each month in first decade
nov1 = nov.avg_temp.mean()
nov2 = nov.avg_temp_uncertainty.mean()
print(nov1,nov2)

15.372714285714286 2.5730000000000004


In [79]:
#nov 1822
sa.at['1822-11-01', 'avg_temp_uncertainty'] = 2.573
sa.at['1822-11-01', 'avg_temp'] = 15.373

In [80]:
#nov 1821
#1821-11-01
sa.at['1821-11-01', 'avg_temp_uncertainty'] = 2.573
sa.at['1821-11-01', 'avg_temp'] = 15.373

In [81]:
#avg temp for each month in first decade
dec1 = dec.avg_temp.mean()
dec2 = dec.avg_temp_uncertainty.mean()
print(dec1,dec2)

11.336428571428572 2.999285714285715


In [82]:
#dec 1821
#1821-12-01
sa.at['1821-12-01', 'avg_temp_uncertainty'] = 2.999
sa.at['1821-12-01', 'avg_temp'] = 11.336

In [83]:
#dec 1822
sa.at['1822-12-01', 'avg_temp_uncertainty'] = 2.999
sa.at['1822-12-01', 'avg_temp'] = 11.336

In [84]:
sa.isnull().sum()

avg_temp                0
avg_temp_uncertainty    0
city                    0
country                 0
latitude                0
longitude               0
month                   0
weekday                 0
dtype: int64