# Ex2 - Raz Bareli

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


### 1.a)
As we've seen in class, we can try and fit a simple model to the data that we do have, and then predict the null values.
In order to choose a model, we can split the data to train and test (the data without the null values). Then we can train, say, 3 different models on the train set, and see which one performed best on the test set.
For this question I will choose the Random Forest Classifier / Regressor, accordingly, since they are very good models, and at the same time simple enough.

In [2]:
df = pd.read_csv("ex2.csv")
df.isna().sum()

Unnamed: 0                      0
incident_id                     0
date                            0
state                           0
city_or_county                  0
address                      2044
n_killed                        0
n_injured                       0
congressional_district       1494
gun_stolen                  12430
gun_type                    12423
incident_characteristics       42
latitude                     1018
location_description        24799
longitude                    1018
n_guns_involved             12423
notes                       10211
participant_age             11658
participant_age_group        5419
participant_gender           4659
participant_name            15327
participant_relationship    28033
participant_status           3530
participant_type             3160
state_house_district         4826
state_senate_district        3997
dtype: int64

For the categorical feature, I'll choose participant_gender. For the prediction, I'll use the state' n_killed and n_injured features, since they sound the most relevant. For example, I don't think there is a connection between gender and date, so adding this feature to the prediction can harm it.

Before that, we'll process the gender to 3 categories:
Male, Female, Both:

In [3]:
gender_no_null = df[~df['participant_gender'].isnull()]

gender_no_null.loc[gender_no_null['participant_gender'].str.contains('Female', regex=True) &                         gender_no_null['participant_gender'].str.contains('Male', regex=True), ['participant_gender']] = "Both"
gender_no_null.loc[gender_no_null['participant_gender'].str.contains('Female', regex=True), ['participant_gender']] = "Female"
gender_no_null.loc[gender_no_null['participant_gender'].str.contains('Male', regex=True), ['participant_gender']] = "Male"


gender_no_null['participant_gender'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


array(['Male', 'Both', 'Male', ..., 'Male', 'Male', 'Male'], dtype=object)

Now will choose the features to be part of the model, and prepare them to the training.

In [4]:
gender_null = df[df['participant_gender'].isnull()]
gender_null = gender_null[['state', 'n_killed', 'n_injured', 'participant_gender']]
gender_null = pd.get_dummies(gender_null, columns=['state'])

gender_no_null = gender_no_null[['state', 'n_killed', 'n_injured', 'participant_gender']]
gender_no_null = pd.get_dummies(gender_no_null, columns=['state'])

X = gender_no_null.drop(columns=['participant_gender'])
y = gender_no_null['participant_gender']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [6]:
rfc = RandomForestClassifier(100,max_depth=20)
rfc.fit(X_train, y_train)
rfc.score(X_test,y_test)

0.8178883175893819

And now for the prediction:

In [7]:
gender_null['participant_gender'] = rfc.predict(gender_null.drop(columns=["participant_gender"]))
gender_null['participant_gender']

0        Male
2        Male
5        Male
7        Male
18       Male
         ... 
29970    Male
29975    Male
29984    Male
29987    Male
29997    Male
Name: participant_gender, Length: 4659, dtype: object

For the continuous variable we'll choose longitude, and we'll use the regression random forest
This time we will use different set of features for the prediction: state and city_or_county.
It's very likely to predict longitude based on the city, at least with minor errors.

In [8]:
df = pd.read_csv("ex2.csv")

df_longitude = df[['state', 'city_or_county', 'longitude']]
df_longitude = pd.get_dummies(df_longitude, columns=['state', 'city_or_county'])

null_longitude = df_longitude[df_longitude['longitude'].isnull()]
no_null_longitude = df_longitude[~df_longitude['longitude'].isnull()]

In [9]:
X = no_null_longitude.drop(columns=['longitude'])
y = no_null_longitude['longitude']
X

Unnamed: 0,state_Alabama,state_Alaska,state_Arizona,state_Arkansas,state_California,state_Colorado,state_Connecticut,state_Delaware,state_District of Columbia,state_Florida,...,city_or_county_Yukon,city_or_county_Yulee,city_or_county_Yuma,city_or_county_Zanesville,city_or_county_Zebulon,city_or_county_Zephyrhills,city_or_county_Zillah,city_or_county_Zion,city_or_county_Zionsville,city_or_county_Zwolle
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [11]:
rfr = RandomForestRegressor(100, max_depth=20)
rfr.fit(X_train, y_train)
rfr.score(X_test,y_test)

0.514768418781371

And now for the prediction:

In [14]:
null_longitude['longitude'] = rfr.predict(null_longitude.drop(columns=["longitude"]))
null_longitude['longitude']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_longitude['longitude'] = rfr.predict(null_longitude.drop(columns=["longitude"]))


18       -86.151538
192      -86.151538
204      -86.151538
213      -86.151538
222     -119.739610
            ...    
29838    -86.151538
29883    -86.151538
29949    -86.151538
29954    -86.151538
29962    -86.151538
Name: longitude, Length: 1018, dtype: float64

### 1.b)

In [17]:
df['state'].value_counts()

Illinois                2165
California              1992
Florida                 1863
Texas                   1719
New York                1258
Ohio                    1236
Georgia                 1144
Pennsylvania            1131
North Carolina          1086
Louisiana               1072
Tennessee                984
South Carolina           856
Missouri                 807
Michigan                 790
Indiana                  746
Virginia                 740
Maryland                 724
Massachusetts            714
New Jersey               688
Alabama                  676
Wisconsin                594
Kentucky                 523
Mississippi              476
Washington               445
Oklahoma                 410
Colorado                 407
District of Columbia     404
Connecticut              365
Arkansas                 326
Minnesota                311
Iowa                     306
Oregon                   304
Arizona                  290
Kansas                   266
Nebraska      