In [1]:
import pandas as pd
import altair as alt
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/star-wars-survey/StarWars.csv'
sw_responses = pd.read_csv(url, encoding='ISO-8859-1', header=None, skiprows=2) # skips first 2 rows
sw_questions = pd.read_csv(url, encoding='ISO-8859-1', header=None, nrows=2) # reads in first 2 rows

In [3]:
print(sw_questions)

             0                                                  1   \
0  RespondentID  Have you seen any of the 6 films in the Star W...   
1           NaN                                           Response   

                                                  2   \
0  Do you consider yourself to be a fan of the St...   
1                                           Response   

                                                  3   \
0  Which of the following Star Wars films have yo...   
1           Star Wars: Episode I  The Phantom Menace   

                                            4   \
0                                          NaN   
1  Star Wars: Episode II  Attack of the Clones   

                                            5   \
0                                          NaN   
1  Star Wars: Episode III  Revenge of the Sith   

                                  6   \
0                                NaN   
1  Star Wars: Episode IV  A New Hope   

                            

## Question 1

In [21]:
questions = (sw_questions.iloc[0,:]
                .replace('Have you seen any of the 6 films in the Star Wars franchise?', 'have_seen_any')
                .replace('Do you consider yourself to be a fan of the Star Wars film franchise?', 'fan_sw')
                .replace('Which of the following Star Wars films have you seen? Please select all that apply.', 'seen_any_')
                .replace('Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.', 'film_rank_')
                .replace('Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.', 'char_rateing_')
                .replace('Which character shot first?', 'shot_first')
                .replace('Are you familiar with the Expanded Universe?', 'familiar_expanded')
                .replace('Do you consider yourself to be a fan of the Expanded Universe?æ', 'fan_expanded')
                .replace('Do you consider yourself to be a fan of the Star Trek franchise?', 'star_trek_fan')
                .str.lower()
                .str.replace(' ', '_')
                .ffill()
            )

response_questions = (sw_questions.iloc[1,:]
                        .replace('Response', '')
                        .str.replace('Star Wars: Episode ', '')
                        .str.lower()
                        .str.replace(' ', '_')
                        .fillna('')
                    )

column_names = questions + response_questions

In [22]:
sw_responses.columns = column_names
print(column_names.to_markdown())


|    | 0                                   |
|---:|:------------------------------------|
|  0 | respondentid                        |
|  1 | have_seen_any                       |
|  2 | fan_sw                              |
|  3 | seen_any_i__the_phantom_menace      |
|  4 | seen_any_ii__attack_of_the_clones   |
|  5 | seen_any_iii__revenge_of_the_sith   |
|  6 | seen_any_iv__a_new_hope             |
|  7 | seen_any_v_the_empire_strikes_back  |
|  8 | seen_any_vi_return_of_the_jedi      |
|  9 | film_rank_i__the_phantom_menace     |
| 10 | film_rank_ii__attack_of_the_clones  |
| 11 | film_rank_iii__revenge_of_the_sith  |
| 12 | film_rank_iv__a_new_hope            |
| 13 | film_rank_v_the_empire_strikes_back |
| 14 | film_rank_vi_return_of_the_jedi     |
| 15 | char_rateing_han_solo               |
| 16 | char_rateing_luke_skywalker         |
| 17 | char_rateing_princess_leia_organa   |
| 18 | char_rateing_anakin_skywalker       |
| 19 | char_rateing_obi_wan_kenobi         |
| 20 | cha

## Question 2

In [5]:
print(sw_responses.query('gender == "Male"')
                .have_seen_any
                .value_counts(normalize=True).to_markdown())

|     |   have_seen_any |
|:----|----------------:|
| Yes |        0.851107 |
| No  |        0.148893 |


In [6]:
print(sw_responses.query('gender == "Female"')
                .have_seen_any
                .value_counts(normalize=True).to_markdown())

|     |   have_seen_any |
|:----|----------------:|
| Yes |        0.723133 |
| No  |        0.276867 |


### who shot first:

In [7]:
hanshotfirst = sw_responses.shot_first.value_counts(normalize=True).reset_index()
hanshotfirst['percent'] = round(hanshotfirst.shot_first*100, 0)
hanshotfirst

Unnamed: 0,index,shot_first,percent
0,Han,0.392512,39.0
1,I don't understand this question,0.369565,37.0
2,Greedo,0.237923,24.0


In [8]:
shot_chart = (alt.Chart(hanshotfirst)
.mark_bar().encode(
    x=alt.X('percent', axis=None),
    y=alt.Y('index', axis=alt.Axis(title=''))
).properties(
    width=300,
    height=100,
    title= {'text': 'Who Shot First?', 'subtitle': 'According to 834 respondents'}
))

shot_labels = shot_chart.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='percent'
)

(shot_chart + shot_labels).properties(height=900)

### seen movie

In [9]:
watched = sw_responses.filter(regex='^seen_').dropna(how='all')
len(watched)
watched


Unnamed: 0,seen_anyi__the_phantom_menace,seen_anyii__attack_of_the_clones,seen_anyiii__revenge_of_the_sith,seen_anyiv__a_new_hope,seen_anyv_the_empire_strikes_back,seen_anyvi_return_of_the_jedi
0,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
2,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,,,
3,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
4,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
5,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
...,...,...,...,...,...,...
1180,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
1181,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
1182,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi
1184,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi


In [10]:
watched_percent = round(watched.notnull().sum() / len(watched), 2).reset_index(name='percent')
watched_percent['percentfull'] = round(watched_percent.percent*100, 0)
watched_percent

Unnamed: 0,index,percent,percentfull
0,seen_anyi__the_phantom_menace,0.81,81.0
1,seen_anyii__attack_of_the_clones,0.68,68.0
2,seen_anyiii__revenge_of_the_sith,0.66,66.0
3,seen_anyiv__a_new_hope,0.73,73.0
4,seen_anyv_the_empire_strikes_back,0.91,91.0
5,seen_anyvi_return_of_the_jedi,0.88,88.0


In [11]:
watched_chart = (alt.Chart(watched_percent)
.mark_bar().encode(
    x=alt.X('percent', axis=None),
    y=alt.Y('index', axis=alt.Axis(title=''))
).properties(
    title= {'text': "Which 'Star Wars' Movies Have You Seen?", 'subtitle': 'Of 835 respondents'}
))

watched_labels = watched_chart.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='percentfull'
)

(watched_chart + watched_labels).properties()

## Question 3

In [12]:
# drop no for seen any
q3 = sw_responses.query('have_seen_any == "Yes"')

In [13]:
#Create a new column that converts the age ranges to a single number. Drop the age range categorical column.
ml_age = (q3.age
    .str.split("-", expand= True)
    .rename(columns = {0:'age_min', 1:'age_max'})
    .apply(lambda x: x.str.replace("> ", ""))
    .astype('float')
    .age_min
    )

In [14]:
#Create a new column that converts the school groupings to a single number. Drop the school categorical column.
ml_school = (q3.education.
        str.replace('Less than high school degree', '9').
        str.replace('High school degree', '12').
        str.replace('Some college or Associate degree', '14').
        str.replace('Bachelor degree', '16').
        str.replace('Graduate degree', '20').
        astype('float'))

In [15]:
#Create a new column that converts the income ranges to a single number. Drop the income range categorical column.
ml_income = (q3.household_income
    .str.replace("\$|,|\+", "")
    .str.split("-", expand=True)
    .rename(columns = {0:'income_min', 1:'income_max'})
    .astype('float')
    .income_min
)

  ml_income = (q3.household_income


In [16]:
#One-hot encode all remaining categorical columns.
ml_onehot = pd.get_dummies(q3.filter(['fan_sw', 'seen__i__the_phantom_menace',
       'seen__ii__attack_of_the_clones', 'seen__iii__revenge_of_the_sith',
       'seen__iv__a_new_hope', 'seen__v_the_empire_strikes_back',
       'seen__vi_return_of_the_jedi', 'viewhan_solo', 'viewluke_skywalker',
       'viewprincess_leia_organa', 'viewanakin_skywalker',
       'viewobi_wan_kenobi', 'viewemperor_palpatine', 'viewdarth_vader',
       'viewlando_calrissian', 'viewboba_fett', 'viewc-3p0', 'viewr2_d2',
       'viewjar_jar_binks', 'viewpadme_amidala', 'viewyoda', 'shotfirst',
       'familiar_eu', 'fan_eu', 'fan_st', 'gender', 'age', 'household_income',
       'education', 'location_(census_region)']), drop_first=True)


In [25]:
# combine all the new columns into a machine learning dataset
starwars_ml = pd.concat([ml_onehot, 
                         q3.filter(['film_ranki__the_phantom_menace', 'film_rankii__attack_of_the_clones',
       'film_rankiii__revenge_of_the_sith', 'film_rankiv__a_new_hope',
       'film_rankv_the_empire_strikes_back', 'film_rankvi_return_of_the_jedi']),
                         ml_age, 
                         ml_school, 
                         ml_income], axis=1)

Unnamed: 0,fan_sw_Yes,gender_Male,age_30-44,age_45-60,age_> 60,"household_income_$100,000 - $149,999","household_income_$150,000+","household_income_$25,000 - $49,999","household_income_$50,000 - $99,999",education_Graduate degree,...,location_(census_region)_West South Central,film_ranki__the_phantom_menace,film_rankii__attack_of_the_clones,film_rankiii__revenge_of_the_sith,film_rankiv__a_new_hope,film_rankv_the_empire_strikes_back,film_rankvi_return_of_the_jedi,age_min,education,income_min
2,0,1,0,0,0,0,0,0,0,0,...,0,1.0,2.0,3.0,4.0,5.0,6.0,18.0,12.0,0.0
3,1,1,0,0,0,1,0,0,0,0,...,0,5.0,6.0,1.0,2.0,4.0,3.0,18.0,14.0,100000.0
4,1,1,0,0,0,1,0,0,0,0,...,0,5.0,4.0,6.0,2.0,1.0,3.0,18.0,14.0,100000.0
5,1,1,0,0,0,0,0,1,0,0,...,0,1.0,4.0,3.0,6.0,5.0,2.0,18.0,16.0,25000.0
8,1,1,0,0,0,0,0,0,0,0,...,0,5.0,4.0,6.0,2.0,1.0,3.0,18.0,14.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,1,0,0,1,0,0,0,0,0,0,...,0,3.0,4.0,5.0,2.0,1.0,6.0,45.0,14.0,0.0
1181,1,0,0,0,0,0,0,0,0,0,...,0,5.0,4.0,6.0,3.0,2.0,1.0,18.0,14.0,0.0
1182,1,0,1,0,0,0,0,0,1,0,...,0,4.0,5.0,6.0,2.0,3.0,1.0,30.0,16.0,50000.0
1184,1,0,0,1,0,1,0,0,0,0,...,0,4.0,3.0,6.0,5.0,2.0,1.0,45.0,14.0,100000.0


In [29]:
starwars_ml = starwars_ml.dropna()

In [30]:
# Create features (x) and target (y)
features = starwars_ml.drop(['income_min'], axis=1)
target = (starwars_ml.income_min >= 50000) * 1
target.value_counts(dropna=False)

1    427
0    245
Name: income_min, dtype: int64

## Question 4

In [34]:
x_train, x_test, y_train, y_test = train_test_split(
    features, 
    target, 
    test_size = .3, 
    random_state = 76)

In [35]:
# create a classification model
classifier_GB = GradientBoostingClassifier()

# train the model
classifier_GB.fit(x_train, y_train)

# use your model to make predictions!
y_predicted = classifier_GB.predict(x_test)

# test how accurate those predictions are
metrics.accuracy_score(y_test, y_predicted)

1.0

In [36]:
print(metrics.classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        69
           1       1.00      1.00      1.00       133

    accuracy                           1.00       202
   macro avg       1.00      1.00      1.00       202
weighted avg       1.00      1.00      1.00       202

