# N2O Emission Analysis using Machine Learning Model

Each country have N2O Emission from different Items and we are interested to classify the Emission values into different zones depending on all the features. 

#### Since N2O is more harmful, the Zones are split into 4 different Categories with 0 for Green, 1 for Yellow, 2 for Orange and 3 for Red. The countries in Zone 3 are in Red Zone and need to take immediate action to reduce the N20 Emission.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Get and Clean Data

Element Codes: 7230, 7234, 7236

Item_Codes: 5061, 6516, 6992, 6995, 6996

Year: 2011 to 2019


As we have many small countries in our list with very less Emission (<5) and population (<5000), which are not impacting the world Emission. So, we are not considering those records. 

In [3]:
#Loading the Csv file from S3 Bucket
#noworld_population_df = pd.read_csv("https://dataanalyticsproject.s3.us-east-2.amazonaws.com/Merged_L5000.csv",index_col=[0]) 
noworld_population_df = pd.read_csv("Emission_Population_L5000_Data.csv") 
noworld_population_df

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1990,12412.308,178.4682
1,2,Afghanistan,5058,Enteric Fermentation,724413,Emissions (CO2eq) from CH4 (AR5),1990,12412.308,4997.1108
2,2,Afghanistan,5058,Enteric Fermentation,723113,Emissions (CO2eq) (AR5),1990,12412.308,4997.1108
3,2,Afghanistan,5059,Manure Management,7225,Emissions (CH4),1990,12412.308,8.5165
4,2,Afghanistan,5059,Manure Management,7230,Emissions (N2O),1990,12412.308,0.3046
...,...,...,...,...,...,...,...,...,...
844768,181,Zimbabwe,6516,Land Use change,7230,Emissions (N2O),2019,14645.468,0.0000
844769,181,Zimbabwe,6516,Land Use change,7273,Emissions (CO2),2019,14645.468,10662.4408
844770,181,Zimbabwe,6516,Land Use change,724413,Emissions (CO2eq) from CH4 (AR5),2019,14645.468,0.0000
844771,181,Zimbabwe,6516,Land Use change,724313,Emissions (CO2eq) from N2O (AR5),2019,14645.468,0.0000


In [4]:
#Finding the Statistical values of each feature
noworld_population_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area_Code,728361.0,129.037646,75.436579,1.0,64.0,128.0,192.0,351.0
Item_Code,728361.0,9750.328847,16082.691077,1707.0,5061.0,6750.0,6994.0,69921.0
Element_Code,728361.0,354233.179775,358123.888115,7225.0,7230.0,7273.0,724313.0,724413.0
Year,728361.0,2004.643095,8.607385,1990.0,1997.0,2005.0,2012.0,2019.0
Population,728361.0,38135.675229,156133.177982,0.768,766.615,5716.161,20526.303,1465634.161
Emission,728361.0,4009.738416,43708.062051,-797183.079,0.0076,3.5152,216.6866,2171273.959


In [5]:
#Details of non-numeric features
noworld_population_df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
Area,728361,240,Portugal,3570
Item,728361,25,Emissions on agricultural land,53082
Element,728361,8,Emissions (CO2eq) (AR5),146660


In [6]:
#Checking the null values
noworld_population_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 728361 entries, 0 to 844772
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Area_Code     728361 non-null  int64  
 1   Area          728361 non-null  object 
 2   Item_Code     728361 non-null  int64  
 3   Item          728361 non-null  object 
 4   Element_Code  728361 non-null  int64  
 5   Element       728361 non-null  object 
 6   Year          728361 non-null  int64  
 7   Population    728361 non-null  float64
 8   Emission      728361 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 55.6+ MB


In [7]:
#List of columns
noworld_population_df.columns

Index(['Area_Code', 'Area', 'Item_Code', 'Item', 'Element_Code', 'Element',
       'Year', 'Population', 'Emission'],
      dtype='object')

In [21]:
#As mentioned above, we are trying to extract the corresponding data from Emission dataset for Countries
emissions_N2O_new_df = noworld_population_df[noworld_population_df['Item_Code'].isin([5059, 5061, 5062, 5063, 5064, 5066, 6516, 67291, 6795, 6992, 6994]) & 
                                        (noworld_population_df['Element_Code'] == 7230) & (noworld_population_df['Year'] > 2010 ) &
                                         (noworld_population_df['Emission'] > 1) & (noworld_population_df['Population'] > 5000) ]


emissions_N2O_new_df

Unnamed: 0,Area_Code,Area,Item_Code,Item,Element_Code,Element,Year,Population,Emission
591095,2,Afghanistan,5062,Manure applied to Soils,7230,Emissions (N2O),2011,30117.413,1.6961
591100,2,Afghanistan,5063,Manure left on Pasture,7230,Emissions (N2O),2011,30117.413,10.7180
591105,2,Afghanistan,5064,Crop Residues,7230,Emissions (N2O),2011,30117.413,1.2643
591322,4,Algeria,5061,Synthetic Fertilizers,7230,Emissions (N2O),2011,36661.445,1.1722
591332,4,Algeria,5063,Manure left on Pasture,7230,Emissions (N2O),2011,36661.445,12.8453
...,...,...,...,...,...,...,...,...,...
844598,251,Zambia,6992,Forest fires,7230,Emissions (N2O),2019,17861.030,36.8186
844655,251,Zambia,6516,Land Use change,7230,Emissions (N2O),2019,17861.030,33.0285
844673,181,Zimbabwe,5061,Synthetic Fertilizers,7230,Emissions (N2O),2019,14645.468,1.3055
844683,181,Zimbabwe,5063,Manure left on Pasture,7230,Emissions (N2O),2019,14645.468,10.7007


In [22]:
#Dropping the unwanted columns 
emissions_N2O_new_df=emissions_N2O_new_df.drop(['Area','Item','Element', 'Element_Code'],axis=1)
emissions_N2O_new_df.head()

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission
591095,2,5062,2011,30117.413,1.6961
591100,2,5063,2011,30117.413,10.718
591105,2,5064,2011,30117.413,1.2643
591322,4,5061,2011,36661.445,1.1722
591332,4,5063,2011,36661.445,12.8453


# Categorizing data

Item_Code = 0 to 4

Element_Code = 0 to 2

Year = 0 to 9 (2011 through 2019)

Population = 0 to 4

Emission = 0 to 6

Zone 0 to 3 

In [23]:
#Catagorizing the Item_Code data
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5059, "Item_Code"] = 0
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5061, "Item_Code"] = 1
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5062, "Item_Code"] = 2
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5063, "Item_Code"] = 3
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5064, "Item_Code"] = 4
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 5066, "Item_Code"] = 5

emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 6516, "Item_Code"] = 6
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 6795, "Item_Code"] = 7
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 6992, "Item_Code"] = 8
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 6994, "Item_Code"] = 9
emissions_N2O_new_df.loc[emissions_N2O_new_df["Item_Code"] == 67291, "Item_Code"] = 10

In [24]:
emissions_N2O_new_df.head()

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission
591095,2,2,2011,30117.413,1.6961
591100,2,3,2011,30117.413,10.718
591105,2,4,2011,30117.413,1.2643
591322,4,1,2011,36661.445,1.1722
591332,4,3,2011,36661.445,12.8453


In [25]:
#Catagorizing Year 2010 through 2019
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2010, "Year"] = 0
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2011, "Year"] = 1
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2012, "Year"] = 2

emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2013, "Year"] = 3
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2014, "Year"] = 4
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2015, "Year"] = 5

emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2016, "Year"] = 6
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2017, "Year"] = 7
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2018, "Year"] = 8
emissions_N2O_new_df.loc[emissions_N2O_new_df["Year"] == 2019, "Year"] = 9



emissions_N2O_new_df.head()

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission
591095,2,2,1,30117.413,1.6961
591100,2,3,1,30117.413,10.718
591105,2,4,1,30117.413,1.2643
591322,4,1,1,36661.445,1.1722
591332,4,3,1,36661.445,12.8453


In [26]:
#Catagorizing Population data into 5 categories

emissions_N2O_new_df.loc[emissions_N2O_new_df["Population"] <=10000, "Population"] = 0
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Population"] > 10000) & (emissions_N2O_new_df["Population"] <= 50000) , "Population"] = 1
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Population"] > 50000) & (emissions_N2O_new_df["Population"] <= 100000), "Population"] = 2
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Population"] > 100000) & (emissions_N2O_new_df["Population"] <= 1000000) , "Population"] = 3
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Population"] > 1000000) , "Population"] = 4     
emissions_N2O_new_df.head()


Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission
591095,2,2,1,1.0,1.6961
591100,2,3,1,1.0,10.718
591105,2,4,1,1.0,1.2643
591322,4,1,1,1.0,1.1722
591332,4,3,1,1.0,12.8453


In [27]:
#Creating Zone Variable
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 1) & (emissions_N2O_new_df["Emission"] <= 10) , "Zone"] = 0
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 10) & (emissions_N2O_new_df["Emission"] <= 25) , "Zone"] = 1
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 25) & (emissions_N2O_new_df["Emission"] <= 75) , "Zone"] = 2
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 75),"Zone"]= 3 

emissions_N2O_new_df.head() 

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission,Zone
591095,2,2,1,1.0,1.6961,0.0
591100,2,3,1,1.0,10.718,1.0
591105,2,4,1,1.0,1.2643,0.0
591322,4,1,1,1.0,1.1722,0.0
591332,4,3,1,1.0,12.8453,1.0


In [28]:
#Catagorizing Emission values into 7 different categories

emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 1) & (emissions_N2O_new_df["Emission"] <= 10) , "Emission"] = 0
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 10) & (emissions_N2O_new_df["Emission"] <= 15) , "Emission"] = 1
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 15) & (emissions_N2O_new_df["Emission"] <= 20) , "Emission"] = 2
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 20) & (emissions_N2O_new_df["Emission"] <= 30) , "Emission"] =3
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 30) & (emissions_N2O_new_df["Emission"] <= 50) , "Emission"] =4
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 50) & (emissions_N2O_new_df["Emission"] <= 100) , "Emission"] =5
emissions_N2O_new_df.loc[(emissions_N2O_new_df["Emission"] > 100),"Emission"]=6

emissions_N2O_new_df.head()                      

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission,Zone
591095,2,2,1,1.0,0.0,0.0
591100,2,3,1,1.0,1.0,1.0
591105,2,4,1,1.0,0.0,0.0
591322,4,1,1,1.0,0.0,0.0
591332,4,3,1,1.0,1.0,1.0


In [29]:
emissions_N2O_new_df["Population"] = emissions_N2O_new_df["Population"].astype(int)
emissions_N2O_new_df["Emission"] = emissions_N2O_new_df["Emission"].astype(int)
emissions_N2O_new_df["Zone"] = emissions_N2O_new_df["Zone"].astype(int)

In [30]:
emissions_N2O_new_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Area_Code,4755.0,135.908307,83.006348,2.0,58.0,131.0,211.0,351.0
Item_Code,4755.0,3.753523,2.999759,0.0,1.0,3.0,6.0,10.0
Year,4755.0,5.016614,2.583306,1.0,3.0,5.0,7.0,9.0
Population,4755.0,1.463512,1.069957,0.0,1.0,1.0,2.0,4.0
Emission,4755.0,0.856362,1.636825,0.0,0.0,0.0,1.0,6.0
Zone,4755.0,0.455521,0.831053,0.0,0.0,0.0,1.0,3.0


In [31]:
emissions_N2O_new_df.tail(20) 

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission,Zone
844067,236,2,9,1,0,0
844072,236,3,9,1,3,2
844096,236,7,9,1,0,0
844106,236,8,9,1,0,0
844172,237,0,9,2,0,0
844181,237,1,9,2,4,2
844186,237,2,9,2,0,0
844191,237,3,9,2,1,1
844196,237,4,9,2,1,1
844207,237,10,9,2,0,0


In [32]:
emissions_N2O_new_df.nunique()

Area_Code     120
Item_Code      11
Year            9
Population      5
Emission        7
Zone            4
dtype: int64

In [33]:
emissions_N2O_new_df["Emission"].value_counts()

0    3424
1     328
3     243
2     242
4     220
6     161
5     137
Name: Emission, dtype: int64

In [34]:
emissions_N2O_new_df["Zone"].value_counts()

0    3424
1     713
2     401
3     217
Name: Zone, dtype: int64

In [35]:
emissions_N2O_new_df["Population"].value_counts()

1    2258
2     833
0     730
3     701
4     233
Name: Population, dtype: int64

## Machine Learning

Data cleaning and classification parts are done for the input features.

Machine learning methods that predict the future Emission depends on many factors like soil temperature,air moisture,Volumetric Water Content(VWC). So, we end up with Classification algorithms which will help us identifying the Emission values into different Zones for each Elements (N2O, CH4, CO2). 

As we have **Imbalanced Emission values** depending on the Country size and population we just can't classify the Zones into Binary vlaues. **Multiclass classification** is the problem of classifying instances into one of three or more classes.

#### Popular algorithms that can be used for multi-class classification include:

Logistic regression

Decision Trees

Random Forest

Naive Bayes

k-Nearest Neighbors

Gradient Boosting


**Logistic regression** is a simple yet very effective classification algorithm. Multinomial logistic regression is an extension of logistic regression that adds native support for multi-class classification problems. So, we are starting with this algorithm. 

In [37]:
emissions_N2O_new_df.reset_index(inplace=True, drop=True)

In [38]:
emissions_N2O_new_df

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission,Zone
0,2,2,1,1,0,0
1,2,3,1,1,1,1
2,2,4,1,1,0,0
3,4,1,1,1,0,0
4,4,3,1,1,1,1
...,...,...,...,...,...,...
4750,251,8,9,1,4,2
4751,251,6,9,1,4,2
4752,181,1,9,1,0,0
4753,181,3,9,1,1,1


In [40]:
# Segment the features from the target
X = emissions_N2O_new_df[["Item_Code", "Year", "Population", "Emission"]]
y = emissions_N2O_new_df[["Zone"]]
#y = emissions_N2O_new_df["Emission"].ravel()

In [41]:
X

Unnamed: 0,Item_Code,Year,Population,Emission
0,2,1,1,0
1,3,1,1,1
2,4,1,1,0
3,1,1,1,0
4,3,1,1,1
...,...,...,...,...
4750,8,9,1,4
4751,6,9,1,4
4752,1,9,1,0
4753,3,9,1,1


In [42]:
#y = y.ravel()
y

Unnamed: 0,Zone
0,0
1,1
2,0
3,0
4,1
...,...
4750,2
4751,2
4752,0
4753,1


In [43]:
emissions_N2O_new_df

Unnamed: 0,Area_Code,Item_Code,Year,Population,Emission,Zone
0,2,2,1,1,0,0
1,2,3,1,1,1,1
2,2,4,1,1,0,0
3,4,1,1,1,0,0
4,4,3,1,1,1,1
...,...,...,...,...,...,...
4750,251,8,9,1,4,2
4751,251,6,9,1,4,2
4752,181,1,9,1,0,0
4753,181,3,9,1,1,1


In [44]:
np.shape(X)

(4755, 4)

In [45]:
np.shape(y)

(4755, 1)

In [46]:
test_sizes = 0.20
seed = 1
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_sizes, random_state=seed, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(3804, 4)
(951, 4)
(3804, 1)
(951, 1)


In [47]:
#model = LogisticRegression()
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [48]:
a = model.fit(X_train, Y_train)
a

LogisticRegression(multi_class='multinomial')

In [49]:
predictions = model.predict(X_test)

In [50]:
predictions

array([2, 0, 0, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 2, 0,
       0, 0, 2, 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 3, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 1, 0, 0,
       0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 3,
       0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 2, 3, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 3, 0, 2, 0, 1, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       3, 0, 0, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 2,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 0, 0, 2,
       0, 0, 0, 0, 0, 3, 1, 0, 2, 0, 0, 0, 1, 0, 0,

In [51]:
X_train

Unnamed: 0,Item_Code,Year,Population,Emission
283,3,1,1,3
4142,6,8,2,0
2196,4,5,1,1
1776,3,4,1,1
920,2,2,0,0
...,...,...,...,...
334,2,1,3,1
800,4,2,1,0
4517,7,9,1,0
1551,0,3,1,0


In [52]:
X_test
print(a.score(X_test, Y_test))

0.9800210304942166


In [53]:
Y_test

Unnamed: 0,Zone
4257,2
120,0
4013,0
3129,0
4446,1
...,...
462,0
3829,0
1384,0
1170,3


In [54]:
predictions

array([2, 0, 0, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 2, 0,
       0, 0, 2, 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 3, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 1, 0, 0,
       0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 3,
       0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 2, 3, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 3, 0, 2, 0, 1, 1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       3, 0, 0, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 2,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 0, 0, 2,
       0, 0, 0, 0, 0, 3, 1, 0, 2, 0, 0, 0, 1, 0, 0,

In [55]:
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       685
           1       0.92      0.99      0.95       143
           2       0.92      0.84      0.88        80
           3       1.00      0.91      0.95        43

    accuracy                           0.98       951
   macro avg       0.96      0.93      0.94       951
weighted avg       0.98      0.98      0.98       951



In [56]:
confusion_matrix(Y_test, predictions)

array([[685,   0,   0,   0],
       [  0, 141,   2,   0],
       [  0,  13,  67,   0],
       [  0,   0,   4,  39]], dtype=int64)

## Random forest classifier

A random forest classifier works with data having discrete labels or better known as class. 

#### Advantages of Random Forest

It reduces overfitting in decision trees and helps to improve the accuracy

It is flexible to both classification and regression problems

It works well with both categorical and continuous values

It automates missing values present in the data

Normalising of data is not required as it uses a rule-based approach.


### Emission Zone ( 0 , 1, 2, 3, 4)

In [57]:
emissions_N2O_array = np.asarray(emissions_N2O_new_df)
emissions_N2O_array

array([[  2,   2,   1,   1,   0,   0],
       [  2,   3,   1,   1,   1,   1],
       [  2,   4,   1,   1,   0,   0],
       ...,
       [181,   1,   9,   1,   0,   0],
       [181,   3,   9,   1,   1,   1],
       [181,   7,   9,   1,   0,   0]], dtype=int64)

In [59]:
X = emissions_N2O_array[:,1:5]
X

array([[2, 1, 1, 0],
       [3, 1, 1, 1],
       [4, 1, 1, 0],
       ...,
       [1, 9, 1, 0],
       [3, 9, 1, 1],
       [7, 9, 1, 0]], dtype=int64)

In [60]:
y = emissions_N2O_array[:,5:6]
y

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]], dtype=int64)

In [61]:
np.shape(X)

(4755, 4)

In [62]:
np.shape(y)

(4755, 1)

In [63]:
y

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]], dtype=int64)

In [64]:
test_sizes = 0.20
seed = 1
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = test_sizes, random_state =seed,stratify=y)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(3804, 4)
(951, 4)
(3804, 1)
(951, 1)


In [65]:
Emission_model = RandomForestClassifier(max_depth=2, random_state=0)

In [66]:
Emission_model_fit = Emission_model.fit(X_train, Y_train)

In [67]:
model_prediction = Emission_model_fit.predict(X_test)

In [68]:
Emission_model_fit.score(X_test, Y_test)

0.917981072555205

In [69]:
Y_test

array([[2],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [0],
       [2],
       [2],
       [0],
       [0],
       [2],
       [3],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [2],
       [3],
       [0],
       [1],
       [1],
       [0],
       [0],
       [2],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [2],
       [0],
       [2],
       [0],
       [0],
       [0],
       [0],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
    

In [70]:
model_prediction

array([2, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 3,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 2, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       3, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 3, 0, 0, 0, 1, 0, 0,

In [71]:
confusion_matrix = confusion_matrix(Y_test, model_prediction)
print(confusion_matrix)

[[685   0   0   0]
 [  0 143   0   0]
 [  0  59  20   1]
 [  0  10   8  25]]


In [72]:
print(classification_report(Y_test, model_prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       685
           1       0.67      1.00      0.81       143
           2       0.71      0.25      0.37        80
           3       0.96      0.58      0.72        43

    accuracy                           0.92       951
   macro avg       0.84      0.71      0.73       951
weighted avg       0.93      0.92      0.91       951



##  Trying Scalar and n-estimators

In [73]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [74]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [75]:
X_train_scaled

array([[ 1.4309906 , -0.41133069,  1.43340026, -0.52244416],
       [-0.58358203, -1.18803454,  0.50064381, -0.52244416],
       [-1.25510624,  1.14207702,  1.43340026,  0.69841814],
       ...,
       [ 0.42370428,  0.36537317,  2.36615671, -0.52244416],
       [-0.91934414,  1.14207702,  0.50064381,  1.91928043],
       [ 1.7667527 , -1.57638646, -1.36486908, -0.52244416]])

In [76]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [77]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [78]:
# Making predictions using the testing data.
predictions_s = rf_model.predict(X_test_scaled)

In [79]:
predictions_s

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [80]:
y_test

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [81]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions_s)

In [82]:
acc_score

0.9739276703111859