### IMPORTING LIBRARIES

In [1]:
# Basic Library Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import uniform, randint
# FEATURE ENGINEERING
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler

# MODELLING
from sklearn.model_selection import GridSearchCV, train_test_split ,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor , AdaBoostRegressor ,RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error , r2_score



# WARNINGS
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")



### READING AND ANALYZING DATA

In [2]:
# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# List the column labels available in the frame.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

In [6]:
# Per-column count of missing (NaN) entries.
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

### HANDLING FEATURES

#### 1. Cap-shape

In [8]:
# Frequency of each cap-shape level; used below to pick the rare levels.
cap_shape = df['cap-shape'].value_counts()

In [9]:
# Levels seen fewer than 1000 times are folded into a single "other" code, 'o'.
cap_shape_index = cap_shape.loc[cap_shape.lt(1000)].index
df['cap-shape'] = df['cap-shape'].mask(df['cap-shape'].isin(cap_shape_index), 'o')

In [10]:
 # Re-count cap-shape levels to confirm the grouping.
 df['cap-shape'].value_counts()

x    3656
f    3152
o    1316
Name: cap-shape, dtype: int64

#### 2. cap-surface

In [11]:
 # Frequency of each cap-surface level before grouping.
 df['cap-surface'].value_counts()

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

In [12]:
# Relabel both 'f' and 'g' as the shared code 'o'; 'y' and 's' are left as they are.
df['cap-surface'] = df['cap-surface'].replace({"f": 'o', "g": 'o'})

In [13]:
# Re-count cap-surface levels to confirm the grouping.
df['cap-surface'].value_counts()

y    3244
s    2556
o    2324
Name: cap-surface, dtype: int64

#### 3. cap-color

In [14]:
# Frequency of each cap-color level; used below to pick the rare levels.
cap_color = df['cap-color'].value_counts()

In [15]:
# Levels seen fewer than 1000 times are folded into a single "other" code, 'o'.
cap_color_index = cap_color.loc[cap_color.lt(1000)].index
df['cap-color'] = df['cap-color'].mask(df['cap-color'].isin(cap_color_index), 'o')

In [16]:
# Re-count cap-color levels to confirm the grouping.
df['cap-color'].value_counts()

n    2284
g    1840
e    1500
y    1072
w    1040
o     388
Name: cap-color, dtype: int64

#### 4. bruises

In [17]:
# Frequency of each bruises level; the column is not modified in this section.
df['bruises'].value_counts()

f    4748
t    3376
Name: bruises, dtype: int64

#### 5. Odor

In [18]:
# Frequency of each odor level; kept in `odor` for the threshold step and displayed.
odor=df['odor'].value_counts()
odor

n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: odor, dtype: int64

In [19]:
# Levels seen fewer than 1000 times are folded into a single "other" code, 'o'.
odor_index = odor.loc[odor.lt(1000)].index
df['odor'] = df['odor'].mask(df['odor'].isin(odor_index), 'o')

In [20]:
# Re-count odor levels to confirm the grouping.
df['odor'].value_counts()

n    3528
o    2436
f    2160
Name: odor, dtype: int64

#### 6. gill-attachment, spacing, size and color

In [21]:
# Frequency of each gill-attachment level; the column is not modified in this section.
df['gill-attachment'].value_counts()

f    7914
a     210
Name: gill-attachment, dtype: int64

In [22]:
# Frequency of each gill-spacing level; the column is not modified in this section.
df['gill-spacing'].value_counts()

c    6812
w    1312
Name: gill-spacing, dtype: int64

In [23]:
# Frequency of each gill-size level; the column is not modified in this section.
df['gill-size'].value_counts()

b    5612
n    2512
Name: gill-size, dtype: int64

In [24]:
# Frequency of each gill-color level before grouping.
df['gill-color'].value_counts()

b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: gill-color, dtype: int64

In [25]:
# Two-step regrouping of gill-color, applied in order:
#   1. 'g' and 'h' are merged under the combined label "gh";
#   2. 'u', 'k', 'e', 'y', 'o' and 'r' all share the "other" label "o".
df['gill-color'] = (
    df['gill-color']
    .replace(["g", "h"], "gh")
    .replace(["u", "k", "e", 'y', "o", "r"], "o")
)


In [26]:
# Re-count gill-color levels to confirm the grouping.
df['gill-color'].value_counts()

b     1728
p     1492
gh    1484
w     1202
o     1170
n     1048
Name: gill-color, dtype: int64

#### 7. Stalk

In [27]:
# Re-display the column labels as a reference for the stalk-related columns handled next.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [28]:
# Frequency of each stalk-shape level; the column is not modified in this section.
df['stalk-shape'].value_counts()

t    4608
e    3516
Name: stalk-shape, dtype: int64

In [29]:
# Frequency of each stalk-root level before grouping (note the '?' level).
df['stalk-root'].value_counts()

b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64

In [30]:
# Everything except 'b' is relabelled 'o', turning stalk-root into a two-level column.
# NOTE(review): '?' presumably marks an unknown/missing root; after this step it can no
# longer be told apart from 'e', 'c' and 'r' -- confirm that is intended.
df['stalk-root'] = df['stalk-root'].replace({'e': 'o', 'c': 'o', 'r': 'o', '?': 'o'})

In [31]:
# Re-count stalk-root levels to confirm the grouping.
df['stalk-root'].value_counts()

o    4348
b    3776
Name: stalk-root, dtype: int64

In [32]:
# Frequency of each stalk-surface-above-ring level before grouping.
df['stalk-surface-above-ring'].value_counts()

s    5176
k    2372
f     552
y      24
Name: stalk-surface-above-ring, dtype: int64

In [33]:
# Keep 's' as-is and relabel 'k', 'f' and 'y' with the shared code 'o'.
df['stalk-surface-above-ring'] = df['stalk-surface-above-ring'].replace(
    {'k': 'o', 'f': 'o', 'y': 'o'}
)

In [34]:
# Re-count stalk-surface-above-ring levels to confirm the grouping.
df['stalk-surface-above-ring'].value_counts()

s    5176
o    2948
Name: stalk-surface-above-ring, dtype: int64

In [35]:
# Frequency of each stalk-surface-below-ring level before grouping.
df['stalk-surface-below-ring'].value_counts()

s    4936
k    2304
f     600
y     284
Name: stalk-surface-below-ring, dtype: int64

In [36]:
# Keep 's' as-is and relabel 'k', 'f' and 'y' with the shared code 'o'.
df['stalk-surface-below-ring'] = df['stalk-surface-below-ring'].replace(
    {'k': 'o', 'f': 'o', 'y': 'o'}
)

In [37]:
# Re-count stalk-surface-below-ring levels to confirm the grouping.
df['stalk-surface-below-ring'].value_counts()

s    4936
o    3188
Name: stalk-surface-below-ring, dtype: int64

In [38]:
# Frequency of each stalk-color-above-ring level; kept for the threshold step and displayed.
stalk_color1 = df['stalk-color-above-ring'].value_counts()
stalk_color1

w    4464
p    1872
g     576
n     448
b     432
o     192
e      96
c      36
y       8
Name: stalk-color-above-ring, dtype: int64

In [39]:
# Levels seen fewer than 1000 times are folded into a single "other" code, "o".
# The raw column already contains an 'o' level; it is below the threshold, so it
# ends up in the same bucket.
stalk_color1_index = stalk_color1.loc[stalk_color1.lt(1000)].index
df['stalk-color-above-ring'] = df['stalk-color-above-ring'].mask(
    df['stalk-color-above-ring'].isin(stalk_color1_index), "o"
)

In [40]:
# Re-count stalk-color-above-ring levels to confirm the grouping.
df['stalk-color-above-ring'].value_counts()

w    4464
p    1872
o    1788
Name: stalk-color-above-ring, dtype: int64

In [41]:
# Frequency of each stalk-color-below-ring level; kept for the threshold step and displayed.
stalk_color2=df['stalk-color-below-ring'].value_counts()
stalk_color2

w    4384
p    1872
g     576
n     512
b     432
o     192
e      96
c      36
y      24
Name: stalk-color-below-ring, dtype: int64

In [42]:
# Levels seen fewer than 1000 times are folded into a single "other" code, "o".
# The raw column already contains an 'o' level; it is below the threshold, so it
# ends up in the same bucket.
stalk_color2_index = stalk_color2.loc[stalk_color2.lt(1000)].index
df['stalk-color-below-ring'] = df['stalk-color-below-ring'].mask(
    df['stalk-color-below-ring'].isin(stalk_color2_index), "o"
)

In [43]:
# Re-count stalk-color-below-ring levels to confirm the grouping.
df['stalk-color-below-ring'].value_counts()

w    4384
p    1872
o    1868
Name: stalk-color-below-ring, dtype: int64

#### 8. Veil

In [44]:
# Frequency of each veil-type level.
df['veil-type'].value_counts()

p    8124
Name: veil-type, dtype: int64

In [45]:
# Frequency of each veil-color level.
df['veil-color'].value_counts()

w    7924
n      96
o      96
y       8
Name: veil-color, dtype: int64

In [46]:
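# The veil columns are not needed: veil-type has a single level and veil-color is almost always 'w'.
# NOTE: nothing is dropped in this cell, so both columns are still in df when it is saved to final_data.csv.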

#### 9. Ring

In [47]:
# Frequency of each ring-type level before grouping.
df['ring-type'].value_counts()

p    3968
e    2776
l    1296
f      48
n      36
Name: ring-type, dtype: int64

In [48]:
# Keep 'p' and 'e'; relabel 'l', 'f' and 'n' with the shared code "o".
df['ring-type'] = df['ring-type'].replace({"l": "o", "f": "o", "n": "o"})

In [49]:
# Re-count ring-type levels to confirm the grouping.
df['ring-type'].value_counts()

p    3968
e    2776
o    1380
Name: ring-type, dtype: int64

In [50]:
# Frequency of each ring-number level.
df['ring-number'].value_counts()

o    7488
t     600
n      36
Name: ring-number, dtype: int64

In [51]:
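# ring-number is not wanted as a feature (a single level, 'o', covers most rows).
# NOTE: nothing is dropped in this cell, so the column is still in df when it is saved to final_data.csv.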

#### 10. Spore-print, population, habitat

In [52]:
# Frequency of each spore-print-color level; used below to pick the rare levels.
spore = df['spore-print-color'].value_counts()

In [53]:
# This column uses a higher cut-off: levels seen fewer than 1700 times share the "other" code "o".
spore_index = spore.loc[spore.lt(1700)].index
df['spore-print-color'] = df['spore-print-color'].mask(
    df['spore-print-color'].isin(spore_index), "o"
)

In [54]:
# Re-count spore-print-color levels to confirm the grouping.
df['spore-print-color'].value_counts()

w    2388
n    1968
o    1896
k    1872
Name: spore-print-color, dtype: int64

In [55]:
# Frequency of each population level; used below to pick the rare levels.
pop=df['population'].value_counts()

In [56]:
# Cut-off of 2000 here: any level seen fewer than 2000 times shares the "other" code "o".
pop_index = pop.loc[pop.lt(2000)].index
df['population'] = df['population'].mask(df['population'].isin(pop_index), "o")

In [57]:
# Re-count population levels to confirm the grouping.
df['population'].value_counts()

o    4084
v    4040
Name: population, dtype: int64

In [58]:
# Frequency of each habitat level; used below to pick the rare levels.
habitat=df['habitat'].value_counts()

In [59]:
# Levels seen fewer than 1000 times are folded into a single "other" code, "o".
habitat_index = habitat.loc[habitat.lt(1000)].index
df['habitat'] = df['habitat'].mask(df['habitat'].isin(habitat_index), "o")

In [60]:
# Re-count habitat levels to confirm the grouping.
df['habitat'].value_counts()

d    3148
g    2148
o    1684
p    1144
Name: habitat, dtype: int64

In [61]:
# Preview the frame after all the category regrouping above.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,o,f,c,n,o,...,s,w,w,p,w,o,p,k,o,o
1,e,x,s,y,t,o,f,c,b,o,...,s,w,w,p,w,o,p,n,o,g
2,e,o,s,w,t,o,f,c,b,n,...,s,w,w,p,w,o,p,n,o,o
3,p,x,y,w,t,o,f,c,n,n,...,s,w,w,p,w,o,p,k,o,o
4,e,x,s,g,f,n,f,w,b,o,...,s,w,w,p,w,o,e,n,o,g


In [62]:
# Persist the regrouped frame; the row index is left out of the file.
df.to_csv(path_or_buf="final_data.csv", index=False)