# Gender Neutrality and Inclusion

## Reading the data


In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder


from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train=pd.read_csv("../input/gender-neutrality-and-inclusion/Train.csv")
train.head()

In [None]:
# (number of rows, number of columns) of the training data
train.shape

In [None]:
# Load the test CSV into a DataFrame and preview its first rows.
test=pd.read_csv("../input/gender-neutrality-and-inclusion/Test.csv")
test.head()

In [None]:
# (number of rows, number of columns) of the test data
test.shape

# Exploratory data analysis

### Check for null values

In [None]:
# Number of missing values in each column of the training data
train.isnull().sum()

In [None]:
# Number of missing values in each column of the test data
test.isnull().sum()

The training dataset has null values only in the BiasInfluentialFactor column.
The rest of the data is clean.

In [None]:
# Plot the boolean null mask of the training data as a heatmap
# (row labels hidden, no colour bar) to see where the missing entries sit.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

## Pairplot

In [None]:
# Pairwise relationships between the training columns, coloured by BiasInfluentialFactor
sns.pairplot(train,hue="BiasInfluentialFactor")

In [None]:
# Summary statistics of the training data
train.describe()

## Check Correlation

In [None]:
# Pairwise correlation of the numeric training columns.
# `train` still contains text columns (names, categories); correlation is not
# defined for them and recent pandas versions raise instead of silently
# skipping them, so restrict the frame to numeric/bool columns first.
train.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# using .heatmap() of seaborn to better understand the relationship between variables
# Only numeric/bool columns are correlated: `train` still contains text columns,
# which recent pandas versions refuse to pass through .corr().
sns.heatmap(train.select_dtypes(include=["number", "bool"]).corr(), annot=True,cbar=False)

From the heatmap it is clear that Graduation year and Age are negatively correlated, and Graduation year also has a negative correlation with Years of experience.

Another thing to notice is that Expected CTC and Current CTC are positively correlated.

Age is also positively correlated with years of experience 

Because these columns are strongly correlated with features we keep, we can drop Graduation year, Current CTC and Years of Experience from our data.

# Part 1 - Fitment regression

# Let's create a dataframe with the required columns

In [None]:
# Columns left out of the fitment model: EmpID and EmpName are not required,
# BiasInfluentialFactor is not a predictor here, and the remaining ones are
# dropped following the correlation check.
regression_drop_cols = ['GraduationYear','EmpID', 'EmpName','CurrentCTC','BiasInfluentialFactor','YearsOfExperince']
df = train.drop(columns=regression_drop_cols)

# preview the first five rows
df.head()

## Label encoding on data with 'object' datatype

In [None]:
# Replace every object-typed column with integer codes (label encoding:
# one integer per distinct value, not one-hot columns).

# collect the object-typed columns first, then encode them one by one
object_cols = [col for col in df.columns if df[col].dtype=='object']
for col in object_cols:
    # a fresh encoder per column, fitted and applied on that column's values
    df[col] = LabelEncoder().fit_transform(list(df[col].values))

# check the dataframe after label encoding
df.head()

In [None]:
# Separate the target (FitmentPercent) from the predictor columns
Y = df["FitmentPercent"]
X = df.drop(columns=['FitmentPercent'])

In [None]:
# Find the correlation (last check)
# X holds only the predictor columns here: the target was dropped when X was built.
X.corr()

In [None]:
# Hold out 15% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.85, random_state=42
)

# report the size of each split
for split_name, split_x, split_y in (("training", x_train, y_train),
                                     ("testing", x_test, y_test)):
    print("{} data shape:-{} labels{} ".format(split_name, split_x.shape, split_y.shape))

In [None]:
# train Random forest regressor

# create the model (500 trees, fixed seed; out-of-bag score is computed during fit)
model_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)

print("Training..........")
# fitting the model
model_rf.fit(x_train, y_train)

# predictions on the data the model was fitted on
pred_train_rf = model_rf.predict(x_train)

print("Training Evaluation")
# np.sqrt(MSE) is the ROOT mean square error, so label it as such
print("Root Mean Square Error", np.sqrt(mean_squared_error(y_train, pred_train_rf)))
print("R2 Score", r2_score(y_train, pred_train_rf))
# out-of-bag R2: estimated from samples each tree did not see while fitting
print("OOB R2 Score", model_rf.oob_score_)

print("Testing Evaluation")
# predictions on the held-out split
pred_test_rf = model_rf.predict(x_test)
print("Root Mean Square Error", np.sqrt(mean_squared_error(y_test, pred_test_rf)))
print("R2 Score", r2_score(y_test, pred_test_rf))

## Test dataset for regression testing

In [None]:
# Remove the columns the model does not use from the test data
test_drop_cols = ['GraduationYear','EmpID', 'EmpName','CurrentCTC','YearsOfExperince']
df_test = test.drop(columns=test_drop_cols)

In [None]:
# Encode each object column of the test set with the integer codes learned
# from the matching training column, so every category maps to the same number
# the regressor was trained on (a LabelEncoder fitted on the test set alone can
# assign different codes to the same category).
for f in df_test.columns:
    if df_test[f].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values))
        codes = {label: code for code, label in enumerate(lbl.classes_)}
        # categories never seen in training have no learned code; flag them as -1
        df_test[f] = df_test[f].map(codes).fillna(-1).astype(int)

# No scaling here: model_rf was fitted on the unscaled training features, so
# standardising the test features would feed it values on a different scale.
df_test.head()

In [None]:
# Predict the fitment percentage for every test row and keep the result as a list
fitment_pred = model_rf.predict(df_test)
val = list(fitment_pred)


In [None]:
import matplotlib.pyplot as plt

# Both series are sorted independently, so the curves compare the spread of
# actual vs. predicted values rather than per-sample errors.
fig, ax = plt.subplots()
ax.set_title("Graph showing training accuracy")
ax.plot(sorted(y_train), label="original value")
ax.plot(sorted(model_rf.predict(x_train)), label="Model Predictions")
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Both series are sorted independently, so the curves compare the spread of
# actual vs. predicted values rather than per-sample errors.
fig, ax = plt.subplots()
ax.set_title("Graph showing testing accuracy")
ax.plot(sorted(y_test), label="original value")
ax.plot(sorted(model_rf.predict(x_test)), label="Model Predictions")
ax.legend()
plt.show()

# Part 2-Bias classification

## This section of the notebook deals with building the bias influential factor classifier

## There are NaN values in the Bias Influential Factor column. I was unsure about their relevance, so I am treating those NaN values as a label of their own. I have started a discussion about this; if changes are required, I'll make them in the notebook after the discussion.

## Get the dataset ready

In [None]:
# Build the classification frame: same dropped columns as before, except that
# BiasInfluentialFactor is kept (it is the target) and FitmentPercent is removed.
classification_drop_cols = ['GraduationYear','EmpID', 'EmpName','CurrentCTC','FitmentPercent','YearsOfExperince']
df = train.drop(columns=classification_drop_cols)
df.head()

In [None]:
# encoding the object datatype values in data
# The target column gets its own LabelEncoder, kept in `le` so predicted codes
# can be converted back to the original labels later.
# NOTE(review): no imputation happens before this call, so the NaN entries of
# BiasInfluentialFactor reach the encoder as-is — confirm they end up as their own class.
le =LabelEncoder()
df['BiasInfluentialFactor']=le.fit_transform(df['BiasInfluentialFactor'])

In [None]:
# Re-count missing values per column after encoding the target
df.isnull().sum()

In [None]:
def assign_labels(df,column):
    """Build a value -> integer code mapping for one column.

    Codes are assigned in order of first appearance of each distinct value
    in ``df[column]``, starting at 0. The mapping is printed and returned.
    """
    distinct_values = df[column].unique().tolist()
    mapping = {value: code for code, value in enumerate(distinct_values)}
    print(mapping)
    return mapping

# Replace every remaining object column with the integer codes from assign_labels
object_columns = [name for name in df.columns if df[name].dtype=='object']
for name in object_columns:
    df[name] = df[name].map(assign_labels(df, name))
df.head()

In [None]:
# Separate the target (BiasInfluentialFactor) from the predictor columns
Y = df['BiasInfluentialFactor']
X = df.drop(columns=['BiasInfluentialFactor'])

In [None]:
# Standardise the features with StandardScaler; `scaler` keeps the fitted statistics
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
# first scaled row, as a quick sanity check
print(X[0])

In [None]:
# Hold out 15% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.85, random_state=42
)

# report the size of each split
for split_name, split_x, split_y in (("training", x_train, y_train),
                                     ("testing", x_test, y_test)):
    print("{} data shape:-{} labels{} ".format(split_name, split_x.shape, split_y.shape))

In [None]:
# build a decision tree classifier
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run and the reported accuracies are reproducible.
clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(x_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(x_test, y_test)))

## Make the predictions

In [None]:
# Make Predictions
# Rebuild the test features the same way the classifier's training features
# were built: same dropped columns, same category -> integer mapping (position
# of the value in the training column's unique() list), and the `scaler` that
# was fitted on the classifier's training features. Features encoded or scaled
# from the test set alone would not line up with what the classifier learned.
clf_test = test.drop(['GraduationYear','EmpID', 'EmpName','CurrentCTC','YearsOfExperince'],axis=1)
for col in clf_test.columns:
    if clf_test[col].dtype=='object':
        codes = {label: code for code, label in enumerate(train[col].unique().tolist())}
        # categories absent from training have no code; flag them as -1
        clf_test[col] = clf_test[col].map(codes).fillna(-1)
pred=clf.predict(scaler.transform(clf_test))

In [None]:
# Turn the predicted integer codes back into the original labels and store them as a list
bias_labels = le.inverse_transform(pred)
sol = list(bias_labels)


In [None]:
from sklearn.metrics import confusion_matrix
# Predict on the training split and show the confusion matrix
# (rows: true class, columns: predicted class).
pred1 = clf.predict(x_train)
# fmt="d" prints the counts as plain integers instead of the default short float format
ax = sns.heatmap(confusion_matrix(y_train, pred1), annot=True, fmt="d", cbar=False)
ax.set_title("Confusion Matrix training data")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
# no legend: the heatmap has no labelled artists, so plt.legend() would only emit a warning
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
# Predict on the held-out split and show the confusion matrix
# (rows: true class, columns: predicted class).
pred1 = clf.predict(x_test)
# fmt="d" prints the counts as plain integers instead of the default short float format
ax = sns.heatmap(confusion_matrix(y_test, pred1), annot=True, fmt="d", cbar=False)
ax.set_title("Confusion Matrix testing data")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
# no legend: the heatmap has no labelled artists, so plt.legend() would only emit a warning
plt.show()

# Saving the score

## Let's create a CSV file to store the model predictions in the desired format

In [None]:
# storing the employee ID as list
# Taken from the untouched `test` frame, so the order matches the test rows.
Emp=test['EmpID'].to_list()

In [None]:

# Collect the three output columns: employee id, predicted bias factor, predicted fitment
data = dict(EmpID=Emp, BiasInfluentialFactor=sol, FitmentPercent=val)

# Turn them into the submission DataFrame
df = pd.DataFrame(data)

# Preview the first rows
df.head()

In [None]:
# saving the dataframe as CSV file
# index=False keeps the row index out of the file, so only the three columns are written
df.to_csv('Sol_Pulkit.csv',index=False)

# Thanks!!