# Handle Categorical features

### Importing libraries

In [1]:
#these libraries deal with data
import numpy as np 
import pandas as pd
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE: this is a blanket filter, so deprecation warnings are hidden as well.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the data: load the startup expense CSV (path is relative to the
# notebook's working directory) and preview the first five rows.
df = pd.read_csv('Startups_Expense.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Count how many rows fall into each distinct value of the State column.
df['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

Here you can see that the `State` column contains 3 distinct categories.

You can encode them with **pandas** or with **sklearn's label encoding and one-hot encoding** techniques. Let's see how to do it with pandas.

In [4]:
# One-hot encode State: one indicator column is produced per distinct value.
data = pd.get_dummies(df['State'])

In [5]:
# Preview the indicator columns produced from State.
data.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [6]:
# drop_first=True removes the first indicator column, which avoids the
# dummy variable trap: k distinct categories are encoded with k-1 columns
# (the dropped category is implied when all remaining indicators are 0).
data = pd.get_dummies(df['State'], drop_first=True)

In [7]:
# Preview the reduced set of indicator columns (first category dropped).
data.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


Add these dummy columns to our original dataset.

In [8]:
# Put the State indicator columns in front of the original columns.
# The indicators are rebuilt from df here instead of reusing `data`, so
# re-running this cell cannot stack a second copy of the columns onto itself.
data = pd.concat([pd.get_dummies(df['State'], drop_first=True), df], axis=1)

In [9]:
# Remove the raw text column now that it is represented by indicator columns.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# safe to re-run: a second run finds no State column and is simply a no-op.
data = data.drop(columns='State', errors='ignore')

In [10]:
# Preview the frame after the State column has been dropped.
data.head()

Unnamed: 0,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,1,165349.2,136897.8,471784.1,192261.83
1,0,0,162597.7,151377.59,443898.53,191792.06
2,1,0,153441.51,101145.55,407934.54,191050.39
3,0,1,144372.41,118671.85,383199.62,182901.99
4,1,0,142107.34,91391.77,366168.42,166187.94


So our categorical values have been turned into numerical ones, and a model can now work with the data.

Before building a model, you would normally also work through steps such as:
- Missing values
- Outliers
- Imbalanced data
- Standardization
- Some visualizations

Then you can build models.

### Build model

In [11]:
# Split the frame into features (X) and target (y) by column name.
# Selecting by name instead of position keeps the split correct even if the
# number of indicator columns changes (a positional slice would then leak a
# feature column into y).
# y is kept as a one-column DataFrame, matching the original slice.
X = data.drop(columns='Profit')
y = data[['Profit']]

In [12]:
# Preview the target values.
y.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [13]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): test_size is an integer, which scikit-learn interprets as an
# absolute number of test rows, not a percentage. With the 50 rows counted
# above, that is a 40% test split; use test_size=0.2 if 20% was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=20,
    random_state=33,
)

In [14]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model with default settings, fitted on the training
# rows only. fit() is left as the last expression so the cell displays the
# fitted estimator.
regression = LinearRegression()
regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [15]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = regression.predict(X_test)

In [16]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions against the true
# test targets.
score = r2_score(y_true=y_test, y_pred=y_pred)

In [17]:
# Display the R^2 value computed on the held-out test rows.
score

0.927468592543491