## Data Processing
We will perform the following steps:
1. Handling Missing Values
2. Imbalanced data
3. Handle Categorical Variables: remove rare labels
4. Standardize the values of the variables to the same range
5. Correlation - feature selection

#### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from warnings import filterwarnings
filterwarnings("ignore")

#### Loading the dataset from MongoDB

In [2]:
import os

import pymongo

# The connection string embeds the database username and password, so it is
# read from the environment instead of being committed with the notebook.
# Set it before starting the kernel, e.g.
#   export MONGODB_URI="mongodb+srv://<user>:<password>@<cluster-host>/<database>?retryWrites=true&w=majority"
# NOTE(review): the credentials that were previously hard-coded here should be
# rotated, since they remain visible in the notebook's history.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "before running this notebook."
    )

# NOTE(review): tlsAllowInvalidCertificates=True disables verification of the
# server certificate. It is kept to preserve the existing connection
# behaviour; drop it once the local CA bundle validates the cluster.
client = pymongo.MongoClient(mongodb_uri, tls=True,
                             tlsAllowInvalidCertificates=True)

# Handle to the "test" database; printing it shows the resolved client settings.
db_test = client.test
print(db_test)

Database(MongoClient(host=['sreeman-shard-00-00.jzldx.mongodb.net:27017', 'sreeman-shard-00-01.jzldx.mongodb.net:27017', 'sreeman-shard-00-02.jzldx.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-6iit9f-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'test')


In [3]:
# List the names of the databases available through this client.
client.list_database_names()

['Employee', 'Forest_Fire', 'admin', 'local']

In [4]:
# Open the "Forest_Fire" database and show which collections it holds.
db = client["Forest_Fire"]
db.list_collection_names()

['Clean_data_forest_fire', 'Raw_data_forest_fire']

In [5]:
# Fetch every document of the cleaned collection and build a DataFrame from it.
records = db.get_collection("Clean_data_forest_fire")
list_records = list(records.find())

# "_id" is MongoDB's per-document identifier, not a feature, so it is removed.
df = pd.DataFrame(list_records).drop(columns="_id")
df.head()

Unnamed: 0,day,month,year,temperature,rh,ws,rain,FFMC,DMC,DC,ISI,BUI,FWI,classes,region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,Bejaia
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,Bejaia
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,Bejaia
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,Bejaia
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,Bejaia


In [6]:
# Segregate the features into date, categorical and numerical groups.
date_feat = ["day", "month", "year"]
cat_feat = ["classes", "region"]

# Numerical features: every non-object column that is not a date component.
num_feat = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'O' and name not in date_feat
]

#### 1. Handling Missing Values

In [7]:
# Columns checked for missing values; rows lacking either one are dropped below.
# NOTE(review): presumably these were found to contain nulls during earlier
# exploration — confirm against the raw data.
missing_feat = ["classes","FWI"]

In [8]:
# (rows, columns) before dropping the rows with missing values.
df.shape

(244, 15)

In [9]:
# Discard every row that has a null in any of the columns listed in missing_feat.
df = df.dropna(subset=missing_feat)

In [10]:
# (rows, columns) after the drop, to see how many rows were removed.
df.shape

(243, 15)

In [11]:
# Report only the columns that still contain missing values (prints nothing if none do).
null_counts = df.isna().sum()
for feat, val in null_counts.items():
    if val > 0:
        print(feat, val)

In [12]:
# Number of missing values in each column.
df.isna().sum()

day            0
month          0
year           0
temperature    0
rh             0
ws             0
rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
classes        0
region         0
dtype: int64

#### 2. Imbalanced data

In [13]:
# Percentage of rows belonging to each class of the target variable.
class_counts = df["classes"].value_counts()
class_counts.div(len(df)).mul(100)

fire        56.378601
not fire    43.621399
Name: classes, dtype: float64