In [1]:
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# Location of the raw names-by-year CSV in the byuidatascience/data4names repo.
url_names = 'https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv'

# Read the CSV straight from the URL into a DataFrame.
dat_names = pd.read_csv(filepath_or_buffer=url_names)

### Challenge 1

In [7]:
# Keep only the rows whose `name` column is exactly "Ann".
is_ann = dat_names['name'] == "Ann"
ann = dat_names[is_ann]

23525
23526
23527
23528
23529
...
23626
23627
23628
23629
23630


In [14]:
# Yearly total of the `AZ` column for the name "Ann".
# The aggregation is named with the string 'sum' rather than the Python
# builtin `sum`: passing the builtin to `agg` is deprecated in recent pandas
# and emits a FutureWarning, while the string always selects the groupby sum.
ann_az = (ann
    .groupby('year')
    .agg(az_total = ('AZ', 'sum'))
    .reset_index()
)
ann_az

Unnamed: 0,year,az_total
0,1910,0.0
1,1911,0.0
2,1912,0.0
3,1913,6.0
4,1914,6.0
...,...,...
101,2011,0.0
102,2012,0.0
103,2013,0.0
104,2014,0.0


In [13]:
# Yearly total of the `CO` column for the name "Ann".
# The aggregation is named with the string 'sum' rather than the Python
# builtin `sum`: passing the builtin to `agg` is deprecated in recent pandas
# and emits a FutureWarning, while the string always selects the groupby sum.
ann_co = (ann
    .groupby('year')
    .agg(co_total = ('CO', 'sum'))
    .reset_index()
)
ann_co

Unnamed: 0,year,co_total
0,1910,25.0
1,1911,19.0
2,1912,31.0
3,1913,27.0
4,1914,30.0
...,...,...
101,2011,0.0
102,2012,0.0
103,2013,0.0
104,2014,5.0


In [17]:
# Orange line chart of `az_total` by year. The title names both series
# because this chart is later layered with the blue Colorado chart.
chart_az = (
    alt.Chart(ann_az)
    .mark_line(color='orange')
    .encode(
        alt.X('year', axis=alt.Axis(format='d', title='Year')),
        alt.Y('az_total', axis=alt.Axis(title='Count of Ann')),
    )
    .properties(
        title={'text': 'The history of Ann in Colorado (blue) and Arizona (orange)'},
        width=500,
        height=150,
    )
)
chart_az

In [18]:
# Blue line chart of `co_total` by year. The title names both series
# because this chart is later layered with the orange Arizona chart.
chart_co = (
    alt.Chart(ann_co)
    .mark_line(color='blue')
    .encode(
        alt.X('year', axis=alt.Axis(format='d', title='Year')),
        alt.Y('co_total', axis=alt.Axis(title='Count of Ann')),
    )
    .properties(
        title={'text': 'The history of Ann in Colorado (blue) and Arizona (orange)'},
        width=500,
        height=150,
    )
)
chart_co

In [19]:
# Overlay the two line charts (Arizona first, Colorado on top) in one figure.
chart_layered = chart_az + chart_co
chart_layered

### Challenge 2

In [33]:
# For each year: sum the `ID` and `Total` columns over all names, express the
# `ID` sum as a percentage of the `Total` sum, and rank years by that percentage.
chal_2 = (dat_names
    .groupby('year')
    # String 'sum' instead of the builtin `sum`: passing the builtin to `agg`
    # is deprecated in recent pandas and emits a FutureWarning.
    .agg(id_total = ('ID', 'sum'),
         total = ('Total', 'sum'))
    .assign(percent = lambda x: (x.id_total / x.total)*100)
    .sort_values(by = 'percent', ascending = False)
    .reset_index()
)
# The five years with the highest percentage, printed as a markdown table.
first5 = chal_2.head(5)
print(first5.to_markdown())

|    |   year |   id_total |       total |   percent |
|---:|-------:|-----------:|------------:|----------:|
|  0 |   1978 |    14460   | 2.45557e+06 |  0.588866 |
|  1 |   1979 |    14924.5 | 2.55866e+06 |  0.583293 |
|  2 |   1980 |    15278.5 | 2.64366e+06 |  0.57793  |
|  3 |   1977 |    13977   | 2.46175e+06 |  0.567766 |
|  4 |   1981 |    14897.5 | 2.64723e+06 |  0.562758 |


### Challenge 3

In [44]:
bob = pd.Series([np.nan, 18, 22, 45, 31, np.nan, 85, 38, 129, 800, 22, 5])
dev = bob.dropna()
st_dev = np.std(dev)
chal3 = bob.replace(np.nan, st_dev)
print(chal3.to_markdown())

|    |       0 |
|---:|--------:|
|  0 | 229.524 |
|  1 |  18     |
|  2 |  22     |
|  3 |  45     |
|  4 |  31     |
|  5 | 229.524 |
|  6 |  85     |
|  7 |  38     |
|  8 | 129     |
|  9 | 800     |
| 10 |  22     |
| 11 |   5     |


### Challenge 4

In [45]:
# Load the dwellings machine-learning table from the byuidatascience repo.
dwellings_ml = pd.read_csv(
    "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv"
)

# Predictors: every column except the one the target is derived from
# (`numbaths`) and the `parcel` column.
features = dwellings_ml.drop(columns=['numbaths', 'parcel'])

# Binary target: 1 when a row has more than two baths, otherwise 0.
target = (dwellings_ml['numbaths'] > 2) * 1

In [46]:
# Hold out 35% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.35, random_state=2021
)

In [47]:
# create a classification model
classifier_RF = RandomForestClassifier()
# train the model
classifier_RF.fit(x_train, y_train)
# use your model to make predictions!
y_predicted = classifier_RF.predict(x_test)
# test how accurate those predictions are
metrics.accuracy_score(y_test, y_predicted)


0.9223192019950125

### Challenge 5

In [48]:
# Small character table used for the encoding exercises below.
# One list per column; position i across the lists describes one character.
hp = pd.DataFrame({
    'name': [
        'Harry Potter', 'Hermione Granger', 'Ron Weasley', 'Draco Malfoy',
        'Minerva McGonagall', 'Severus Snape', 'Rubeus Hagrid',
        'Cedric Diggory', 'Ginny Weasley', 'Lord Voldemort',
    ],
    'house': [
        'Gryffindor', 'Gryffindor', 'Gryffindor', 'Slytherin',
        'Gryffindor', 'Slytherin', 'Gryffindor',
        'Hufflepuff', 'Gryffindor', 'Slytherin',
    ],
    'birth_decade': [
        '1980-1989', '1970-1979', '1980-1989', '1980-1989',
        '1920-1929', '1960-1969', '1920-1929',
        '1970-1979', '1980-1989', '1920-1929',
    ],
    # One ancestry value (Minerva McGonagall) is missing.
    'ancestry': [
        'half-blood', 'muggleborn', 'pure-blood', 'pure-blood',
        np.nan, 'half-blood', 'half-blood',
        'pure-blood', 'pure-blood', 'half-blood',
    ],
    'hogwartsStudent': [
        True, True, True, True,
        False, False, False,
        True, True, False,
    ],
})

hp

Unnamed: 0,name,house,birth_decade,ancestry,hogwartsStudent
0,Harry Potter,Gryffindor,1980-1989,half-blood,True
1,Hermione Granger,Gryffindor,1970-1979,muggleborn,True
2,Ron Weasley,Gryffindor,1980-1989,pure-blood,True
3,Draco Malfoy,Slytherin,1980-1989,pure-blood,True
4,Minerva McGonagall,Gryffindor,1920-1929,,False
5,Severus Snape,Slytherin,1960-1969,half-blood,False
6,Rubeus Hagrid,Gryffindor,1920-1929,half-blood,False
7,Cedric Diggory,Hufflepuff,1970-1979,pure-blood,True
8,Ginny Weasley,Gryffindor,1980-1989,pure-blood,True
9,Lord Voldemort,Slytherin,1920-1929,half-blood,False


In [52]:
# House name -> numeric code (as text); substitutions run in this order.
house_codes = {
    'Gryffindor': '1',
    'Hufflepuff': '2',
    'Ravenclaw': '3',
    'Slytherin': '4',
}

new_hp = hp  # NOTE: binds a second name to the same DataFrame, not a copy
new_house = new_hp.house
for house_name, house_code in house_codes.items():
    new_house = new_house.str.replace(house_name, house_code)
# Once every name has been swapped for its code, the strings parse as floats.
new_house = new_house.astype('float')
new_house

0    1.0
1    1.0
2    1.0
3    4.0
4    1.0
5    4.0
6    1.0
7    2.0
8    1.0
9    4.0
Name: house, dtype: float64

In [54]:
# First year of each birth-decade range (e.g. '1980-1989' -> 1980.0),
# returned as a float Series named 'range'.
new_year = (new_hp.birth_decade
    # Strip any '$', ',' or '+' characters. The pattern is a raw string (the
    # non-raw form contains invalid escape sequences) and regex=True is given
    # explicitly, because the default for `regex` differs between pandas
    # versions and the pattern would otherwise be matched literally.
    .str.replace(r"\$|,|\+", "", regex=True)
    # Split 'start-end' and keep only the start column.
    .str.split("-", expand=True)[0]
    .astype('float')
    .rename('range')
)
new_year

  new_year = (new_hp.birth_decade


0    1980.0
1    1970.0
2    1980.0
3    1980.0
4    1920.0
5    1960.0
6    1920.0
7    1970.0
8    1980.0
9    1920.0
Name: range, dtype: float64

In [58]:
# Ancestry label -> numeric code (as text); substitutions run in this order.
ancestry_codes = {
    'pure-blood': '1',
    'half-blood': '2',
    'muggleborn': '3',
}

new_ancestry = new_hp.ancestry
for ancestry_label, ancestry_code in ancestry_codes.items():
    new_ancestry = new_ancestry.str.replace(ancestry_label, ancestry_code)
# Missing ancestry values pass through the replacements and stay NaN.
new_ancestry = new_ancestry.astype('float')
new_ancestry

0    2.0
1    3.0
2    1.0
3    1.0
4    NaN
5    2.0
6    2.0
7    1.0
8    1.0
9    2.0
Name: ancestry, dtype: float64