In [4]:
import pandas as pd

# Adult Income Data

In [8]:
# Load the adult-income data set and preview its first three rows.
# NOTE(review): outputs further down show values with a leading space
# (e.g. ' <=50K', 'workclass_ Private'), so the CSV fields seem to be
# separated by ", ". Consider pd.read_csv(..., skipinitialspace=True) if
# clean labels are wanted -- confirm against the file first.
income_df = pd.read_csv("Adults_Income-1.csv")
income_df.head(3)

Unnamed: 0,age,workclass,occupation,hours-per-week,income
0,20,Private,Sales,44,<=50K
1,31,Private,Sales,38,>50K
2,24,Private,Tech-support,50,<=50K


In [10]:
# Frequency distribution of the categories in the "workclass" column
# (how many rows fall into each category)
income_df["workclass"].value_counts()

 Private         1986
 Self-emp-inc     180
 Federal-gov       51
Name: workclass, dtype: int64

In [11]:
# Frequency distribution of the categories in the "occupation" column
income_df["occupation"].value_counts()

 Sales           1742
 Tech-support     475
Name: occupation, dtype: int64

In [12]:
# Frequency distribution of the target column "income" (class balance)
income_df["income"].value_counts()

 <=50K    1467
 >50K      750
Name: income, dtype: int64

We only need to encode the categorical *features*.
    (The target is left unchanged, even if it is categorical!)
So we first create the feature and target sets, and then handle the categorical variables in the features.

In [13]:
# Features: every column except the last one; target: the "income" column
x_income = income_df.iloc[:, :-1]
y_income = income_df["income"]
# Sanity check: show the first three rows of the features, then of the target
display(x_income.head(3), y_income.head(3))

Unnamed: 0,age,workclass,occupation,hours-per-week
0,20,Private,Sales,44
1,31,Private,Sales,38
2,24,Private,Tech-support,50


0     <=50K
1      >50K
2     <=50K
Name: income, dtype: object

## Handling Categorical Variables

In [14]:
# One-hot encode the categorical features with get_dummies.
# The original categorical columns are dropped from the result.
# The order of columns is as follows:
#   - original numeric cols go first, followed by the newly created binary vars
x_dummies = pd.get_dummies(x_income)
x_dummies.head(3)

Unnamed: 0,age,hours-per-week,workclass_ Federal-gov,workclass_ Private,workclass_ Self-emp-inc,occupation_ Sales,occupation_ Tech-support
0,20,44,0,1,0,1,0
1,31,38,0,1,0,1,0
2,24,50,0,1,0,0,1


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [33]:
# Split the data into train and test sets (80/20, i.e. test_size=.2)
# ----- use x_dummies (features) and y_income (target) in train_test_split
# random_state=0 fixes the shuffle so the split is the same on every run.
# The following cells then:
#   - define a KNN classifier
#   - fit the model using the training data
#   - evaluate its performance
x_train, x_test, y_train, y_test = train_test_split(x_dummies, y_income, test_size=.2, random_state=0)

# Check the shapes of the four resulting pieces
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1773, 7)
(444, 7)
(1773,)
(444,)


In [1]:
# define the model: a KNN classifier that votes over the 7 nearest neighbors.
# It is named knn_cls because the evaluation and prediction cells below call
# knn_cls.score(...) and knn_cls.predict(...).
# The cell with the sklearn imports must be run before this one; otherwise
# KNeighborsClassifier is undefined (NameError).
knn_cls = KNeighborsClassifier(n_neighbors=7)

# fit (train) the model -- fit() takes both the features and the target sets
knn_cls.fit(x_train, y_train)

NameError: name 'KNeighborsClassifier' is not defined

In [35]:
# Evaluate the classifier with score(), once on the training split and once
# on the held-out test split, and report both as percentages
train_accuracy = knn_cls.score(x_train, y_train)
test_accuracy = knn_cls.score(x_test, y_test)
print("knn accuracy on train: {:.2%}".format(train_accuracy))
print("knn accuracy on test: {:.2%}".format(test_accuracy))

knn accuracy on train: 77.44%
knn accuracy on test: 71.62%


In [40]:
# Look at one training row to recall the feature columns and their order
x_train.head(1)

Unnamed: 0,age,hours-per-week,workclass_ Federal-gov,workclass_ Private,workclass_ Self-emp-inc,occupation_ Sales,occupation_ Tech-support
1809,49,40,0,1,0,1,0


In [50]:
# make predictions
# we use predict() method -- it only takes features
# the inputs must be encoded the same way the model was trained
# (same dummy variables, in the same column order as x_train)
p1 = [35, 25, 0, 0, 1, 1, 0]
p2 = [23, 45, 0, 1, 0, 0, 1]
p3 = [45, 40, 0, 1, 0, 1, 0]
# Label the rows with the training column names instead of passing bare lists:
# this raises if a row has the wrong number of values, and keeps the feature
# names the model was fitted with.
new_people = pd.DataFrame([p1, p2, p3], columns=x_train.columns)
knn_cls.predict(new_people)



array([' <=50K', ' <=50K', ' >50K'], dtype=object)

# Categorical Variables Expressed in Numbers

In [59]:
# Small example frame: product_type is a category stored as a number,
# color is a category stored as text
example_columns = {
    "product_type": [1, 3, 2, 2, 1],
    "color": ["red", "blue", "red", "green", "green"],
}
df = pd.DataFrame(data=example_columns)
df

Unnamed: 0,product_type,color
0,1,red
1,3,blue
2,2,red
3,2,green
4,1,green


In [60]:
# One-hot encode df as it is: only the text column (color) is expanded;
# product_type is numeric, so get_dummies leaves it unchanged
pd.get_dummies(df)

Unnamed: 0,product_type,color_blue,color_green,color_red
0,1,0,0,1
1,3,1,0,0
2,2,0,0,1
3,2,0,1,0
4,1,0,1,0


In [61]:
# Inspect the dtype of each column to see why product_type was not encoded
df.dtypes

product_type     int64
color           object
dtype: object

In [62]:
# product_type holds category codes rather than quantities, so convert it to
# string first; get_dummies will then treat it as a categorical column
df = df.astype({"product_type": str})
df.dtypes

product_type    object
color           object
dtype: object

In [63]:
# Both columns are now strings, so each one is expanded into dummy columns
pd.get_dummies(df)

Unnamed: 0,product_type_1,product_type_2,product_type_3,color_blue,color_green,color_red
0,1,0,0,0,0,1
1,0,0,1,1,0,0
2,0,1,0,0,0,1
3,0,1,0,0,1,0
4,1,0,0,0,1,0


# WestRoxbury_categorical Data

In [64]:
# Load the West Roxbury data set and preview its first three rows
west = pd.read_csv("WestRoxbury_categorical.csv")
west.head(3)

Unnamed: 0,TOTAL VALUE,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,9965,1880,2436,1352,2.0,6,3,1,1,1,No,No
1,412.6,6590,1945,3108,1976,2.0,10,4,2,1,1,No,Yes_Recently
2,330.1,7500,1890,2294,1371,2.0,8,4,1,1,1,No,No


In [69]:
# Target: the first column (TOTAL VALUE); features: all remaining columns
y_west = west.iloc[:, 0]
x_west = west.iloc[:, 1:]
# Sanity check: show the first three rows of the features, then of the target
display(x_west.head(3), y_west.head(3))

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,9965,1880,2436,1352,2.0,6,3,1,1,1,No,No
1,6590,1945,3108,1976,2.0,10,4,2,1,1,No,Yes_Recently
2,7500,1890,2294,1371,2.0,8,4,1,1,1,No,No


0    344.2
1    412.6
2    330.1
Name: TOTAL VALUE, dtype: float64

In [82]:
# FLOORS is stored as a number, so get_dummies would leave it untouched.
# Cast it to string first so it is one-hot encoded like the other
# categorical columns. astype() on the frame returns a new DataFrame, which
# avoids assigning into x_west (a slice of west) and the
# SettingWithCopyWarning that such an assignment triggers.
x_west = x_west.astype({"FLOORS": str})
# get_dummies drops the original categorical features.
# The order of columns is as follows:
#   - original numeric cols go first, followed by the newly created binary vars
x_dummies_w = pd.get_dummies(x_west)
x_dummies_w.head(3)

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FLOORS_1.0,FLOORS_1.5,FLOORS_2.0,FLOORS_2.5,FLOORS_3.0,FIREPLACE_No,FIREPLACE_Yes,REMODEL_No,REMODEL_Yes,REMODEL_Yes_Recently
0,9965,1880,2436,1352,6,3,1,1,1,0,0,1,0,0,1,0,1,0,0
1,6590,1945,3108,1976,10,4,2,1,1,0,0,1,0,0,1,0,0,0,1
2,7500,1890,2294,1371,8,4,1,1,1,0,0,1,0,0,1,0,1,0,0


In [87]:
# ----- use x_dummies_w (features) and y_west (target) in train_test_split
# split data to train and test (75/25, i.e. test_size=.25);
# random_state=0 fixes the shuffle so the split is the same on every run
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_dummies_w, y_west, test_size=.25, random_state=0)

# Check the shapes of the four resulting pieces
print(x_train1.shape)
print(x_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(4351, 19)
(1451, 19)
(4351,)
(1451,)


In [88]:
from sklearn.linear_model import LinearRegression

In [89]:
# define the model: a linear regression with default settings
lr = LinearRegression()
# fit the model on the training features and target
lr.fit(x_train1, y_train1)

LinearRegression()

In [90]:
# Evaluate the regression with score(), once on the training split and once
# on the held-out test split, and report both as percentages
r2_train = lr.score(x_train1, y_train1)
r2_test = lr.score(x_test1, y_test1)
print('lr R2 on train:{:.2%}'.format(r2_train))
print('lr R2 on test:{:.2%}'.format(r2_test))

lr R2 on train:82.48%
lr R2 on test:81.06%


In [101]:
# Take the first 3 rows of the test features to use as prediction inputs
sample1 = x_test1.head(3)
sample1

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FLOORS_1.0,FLOORS_1.5,FLOORS_2.0,FLOORS_2.5,FLOORS_3.0,FIREPLACE_No,FIREPLACE_Yes,REMODEL_No,REMODEL_Yes,REMODEL_Yes_Recently
1519,4026,1940,2520,1047,6,3,1,1,1,1,0,0,0,0,0,1,1,0,0
3457,7000,1848,6235,3446,12,5,3,0,1,0,0,1,0,0,0,1,1,0,0
895,4615,1956,2304,1306,6,3,1,1,1,0,1,0,0,0,0,1,1,0,0


In [98]:
# Predict the target for the three sample rows, rounded to whole numbers
lr.predict(sample1).round()

array([309., 645., 316.])

In [100]:
# actual target values of the same three test rows, to compare with the predictions
y_test1.head(3)

1519    266.0
3457    689.3
895     294.5
Name: TOTAL VALUE, dtype: float64