# Model Exercises

## Imports

In [1]:
import numpy as np
import pandas as pd

#viz
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#for presentation purposes
import warnings
warnings.filterwarnings("ignore")

#working with dates
from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt 

#evaluated performance using rmse
import statsmodels.api as sm

#holts linear trend model
from statsmodels.tsa.api import Holt

## Acquire

In [2]:
#csv to dataframe
df = pd.read_csv('GlobalLandTemperaturesByCity.csv')

In [3]:
#check out the data
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [4]:
#check out how many cities are in this dataset
df.City.value_counts

<bound method IndexOpsMixin.value_counts of 0           Århus
1           Århus
2           Århus
3           Århus
4           Århus
            ...  
8599207    Zwolle
8599208    Zwolle
8599209    Zwolle
8599210    Zwolle
8599211    Zwolle
Name: City, Length: 8599212, dtype: object>

In [5]:
#check out columns and datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [6]:
#check out shape of df
df.shape

(8599212, 7)

In [9]:
#find san antonio
sa = df[(df.City == "San Antonio")]

In [10]:
#begining of san antonio data
sa.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
6618616,1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W
6618617,1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W
6618618,1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W
6618619,1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W
6618620,1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W


In [11]:
#amount of nulls in san antonio data
sa.isnull().sum()

dt                               0
AverageTemperature               9
AverageTemperatureUncertainty    9
City                             0
Country                          0
Latitude                         0
Longitude                        0
dtype: int64

In [16]:
#shape of san antonio dataframe
sa.shape

(2325, 7)

In [17]:
#summary of columns and datatypes
sa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2325 entries, 6618616 to 6620940
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   dt                             2325 non-null   object 
 1   AverageTemperature             2316 non-null   float64
 2   AverageTemperatureUncertainty  2316 non-null   float64
 3   City                           2325 non-null   object 
 4   Country                        2325 non-null   object 
 5   Latitude                       2325 non-null   object 
 6   Longitude                      2325 non-null   object 
dtypes: float64(2), object(5)
memory usage: 145.3+ KB


In [18]:
#summary statistics
sa.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,2316.0,2316.0
mean,19.980095,0.812963
std,6.870505,0.810973
min,4.023,0.061
25%,13.74825,0.262
50%,20.3055,0.399
75%,26.686,1.24
max,32.166,5.063


## Prepare

In [20]:
# Reassign the sale_date column to be a datetime type
sa.dt = pd.to_datetime(sa.dt)

In [21]:
# Sort rows by the date and then set the index as that date
sa = sa.set_index("dt").sort_index()

In [22]:
#ensure the change took place
sa.head()

Unnamed: 0_level_0,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1820-01-01,7.169,3.676,San Antonio,United States,29.74N,97.85W
1820-02-01,11.448,2.964,San Antonio,United States,29.74N,97.85W
1820-03-01,15.003,2.413,San Antonio,United States,29.74N,97.85W
1820-04-01,21.022,2.136,San Antonio,United States,29.74N,97.85W
1820-05-01,23.938,1.923,San Antonio,United States,29.74N,97.85W
