In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [62]:
# Load the salaries table from a hard-coded absolute path; adjust if the CSV lives elsewhere.
df = pd.read_csv('/content/salaries.csv')

In [63]:
# Preview the first rows of the raw salaries frame.
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [64]:
from sklearn.preprocessing import LabelEncoder
# One encoder per column so each fitted mapping (classes_) stays available for
# inverse_transform or for encoding new rows. A single shared instance is refit
# by every fit_transform call and only remembers the last column it saw.
# The integer codes produced are the same as before.
le = LabelEncoder()  # left unfitted here; later cells call le.fit_transform themselves
company_le = LabelEncoder()
job_le = LabelEncoder()
degree_le = LabelEncoder()
df['company_name'] = company_le.fit_transform(df['company'])
df['job_description'] = job_le.fit_transform(df['job'])
df['degree_description'] = degree_le.fit_transform(df['degree'])

In [65]:
# Preview again to check the new encoded columns alongside the original text columns.
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k,company_name,job_description,degree_description
0,google,sales executive,bachelors,0,2,2,0
1,google,sales executive,masters,0,2,2,1
2,google,business manager,bachelors,1,2,0,0
3,google,business manager,masters,1,2,0,1
4,google,computer programmer,bachelors,0,2,1,0


In [66]:
# Drop the raw text columns; their integer-encoded copies stay in the frame.
text_cols = ['company', 'job', 'degree']
new = df.drop(columns=text_cols)

In [67]:
# Label column first, then the feature matrix (everything except the label).
target = df['salary_more_then_100k']
new_df = new.drop(columns=['salary_more_then_100k'])

In [68]:
# Fix the seed: scikit-learn permutes candidate features randomly at each split,
# so without random_state the fitted tree can differ between runs.
model = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the tree on the encoded feature columns against the salary flag.
model.fit(new_df,target)

In [70]:
# Accuracy measured on the same rows used for fitting, so this is training
# accuracy, not an estimate of performance on unseen data.
model.score(new_df,target)

1.0

In [None]:
# Pass the row as a DataFrame carrying the training column names: the tree was
# fitted on a DataFrame, so a bare list triggers scikit-learn's
# "X does not have valid feature names" warning and hides what each code means.
# Codes per the preview above: 2 = google, 1 = computer programmer, 0 = bachelors.
model.predict(pd.DataFrame([[2, 1, 0]], columns=new_df.columns))

In [None]:
# Pass the row as a DataFrame carrying the training column names: the tree was
# fitted on a DataFrame, so a bare list triggers scikit-learn's
# "X does not have valid feature names" warning and hides what each code means.
# Codes per the preview above: 2 = google, 1 = computer programmer, 1 = masters.
model.predict(pd.DataFrame([[2, 1, 1]], columns=new_df.columns))

# **One Hot Encoding**

In [73]:
# Load the home-prices table from a hard-coded absolute path; adjust if the CSV lives elsewhere.
home = pd.read_csv('/content/homeprices.csv')

In [74]:
# Overwrite the town names with integer codes (refits the shared `le` encoder).
home['town'] = le.fit_transform(home['town'])

In [75]:
# Regression target first, then the feature frame (all columns except price).
y = home['price']
x = home.drop(columns=['price'])

In [76]:
# Linear regressor (default settings) for the price target.
model_linear = LinearRegression()

In [None]:
# Fit on x as-is: town enters as the integer codes produced by LabelEncoder,
# which a linear model treats as a numeric magnitude rather than a category.
model_linear.fit(x,y)

In [78]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode the town column and pass every other column through unchanged.
# The column is selected by name rather than position so the transformer keeps
# targeting `town` even if the column order of the input frame changes.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), ['town'])],
    remainder='passthrough',
)

In [79]:
# Apply the transformer to x; the result is bound to upper-case X, which is a
# different name from the lower-case x frame it was built from.
X = ct.fit_transform(x)

In [None]:
model.fit(X,y)

In [81]:
model.score(X,y)

1.0

In [82]:
# Load the iris table from a hard-coded absolute path and preview its first rows.
data = pd.read_csv('/content/iris_data.csv')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [83]:
# List each column's dtype (Species is the only object column in the recorded output).
data.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [84]:
# Distinct class labels present in Species.
data['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [85]:
# Row count per Species label.
data['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [86]:
# Replace the Species strings with integer codes. The column is overwritten in
# place, so the original labels are only recoverable via le.classes_ until the
# shared `le` encoder is refit on something else.
data['Species'] = le.fit_transform(data['Species'])

In [87]:
# Confirm the column now holds only the integer codes.
data['Species'].unique()

array([0, 1, 2])

In [88]:
# Load the colour/country table from a hard-coded absolute path and preview its first rows.
nf = pd.read_csv('/content/data.csv')
nf.head()

Unnamed: 0,Id,Colour,Country
0,1,Red,USA
1,2,Blue,UK
2,3,Green,Canada
3,4,Blue,USA
4,5,Blue,USA


In [89]:
# Distinct values in the Colour column.
nf['Colour'].unique()

array(['Red', 'Blue', 'Green'], dtype=object)

In [90]:
# Distinct values in the Country column.
nf['Country'].unique()

array(['USA', 'UK', 'Canada'], dtype=object)

In [91]:
# Standalone one-hot encoder with default settings, used directly rather than
# through a ColumnTransformer.
ohe = OneHotEncoder()

In [92]:
# Fit the encoder on the two categorical columns, then densify the result so it
# can be inspected and wrapped in a DataFrame.
categorical_cols = ['Colour', 'Country']
encoded = ohe.fit_transform(nf[categorical_cols])
feature_array = encoded.toarray()

In [93]:
# Display the dense one-hot matrix (Colour indicator columns followed by Country ones).
feature_array

array([[0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.]])

In [94]:
# categories_ holds one array of category labels per encoded input column.
# ("lebel" is a misspelling of "label"; the name is kept because a later cell reads it.)
feature_lebel = ohe.categories_

In [95]:
# Flatten the per-column category arrays into one 1-D array of labels.
# np.concatenate joins the arrays end to end, so it also works when the encoded
# columns have different numbers of categories. np.array(...).ravel() only
# flattens cleanly when every column has the same count.
feature_lebels = np.concatenate(feature_lebel)

In [96]:
# Display the flattened category labels; their order matches the one-hot columns.
feature_lebels

array(['Blue', 'Green', 'Red', 'Canada', 'UK', 'USA'], dtype=object)

In [97]:
# Wrap the dense matrix in a DataFrame, labelling columns with the flattened category names.
feature = pd.DataFrame(feature_array,columns = feature_lebels)

In [98]:
# Show the original columns side by side with their one-hot indicators.
# The result is displayed only; it is not assigned to a name.
pd.concat(objs=[nf, feature], axis='columns')

Unnamed: 0,Id,Colour,Country,Blue,Green,Red,Canada,UK,USA
0,1,Red,USA,0.0,0.0,1.0,0.0,0.0,1.0
1,2,Blue,UK,1.0,0.0,0.0,0.0,1.0,0.0
2,3,Green,Canada,0.0,1.0,0.0,1.0,0.0,0.0
3,4,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
4,5,Blue,USA,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
295,296,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0
296,297,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
297,298,Green,UK,0.0,1.0,0.0,0.0,1.0,0.0
298,299,Red,Canada,0.0,0.0,1.0,1.0,0.0,0.0


# **Ordinal Encoding**

In [99]:
from sklearn.preprocessing import OrdinalEncoder

In [100]:
# Ordinal encoder with default settings, i.e. categories are inferred from the data at fit time.
ordinal = OrdinalEncoder()

In [101]:
# Load the education/country table from a hard-coded absolute path; adjust if the CSV lives elsewhere.
od = pd.read_csv('/content/ordinal.csv')

In [102]:
# Preview the first rows before encoding.
od.head()

Unnamed: 0,Id,Education,Country
0,1,PhD,USA
1,2,Masters,UK
2,3,Bachelors,Canada
3,4,PhD,USA
4,5,Masters,USA


In [103]:
# Distinct education levels present in the data.
od['Education'].unique()

array(['PhD', 'Masters', 'Bachelors'], dtype=object)

In [104]:
# Encode Education with the encoder's inferred category order. In the preview
# that follows this gives Bachelors=0, Masters=1, PhD=2, which matches
# educational rank only because the sorted labels happen to fall in that order.
# The column is overwritten in place, so the string labels are gone from `od`
# after this cell runs.
od['Education'] = ordinal.fit_transform(od[['Education']])

In [105]:
# Preview the frame after encoding: Education now holds float codes.
od.head()

Unnamed: 0,Id,Education,Country
0,1,2.0,USA
1,2,1.0,UK
2,3,0.0,Canada
3,4,2.0,USA
4,5,1.0,USA


In [106]:
# Explicit category order for an OrdinalEncoder; list position becomes the code.
# NOTE(review): as written this maps PhD->0, Masters->1, Bachelors->2, the
# reverse of the codes produced by the default encoder earlier. Confirm the
# intended direction.
education = ['PhD', 'Masters', 'Bachelors']

In [110]:
# Encoder that takes its category order from the explicit `education` list
# instead of inferring it from the data.
enc = OrdinalEncoder(categories=[education])

In [112]:
# Left disabled: od['Education'] was already overwritten with numeric codes by
# the earlier fit_transform, so fitting `enc` (whose categories are the string
# labels) on it now would hit unknown categories. Reload `od` before enabling.
# od[["Education"]] = enc.fit_transform(od[["Education"]])