In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw training data from train.csv (path is relative to the working directory).
df_train = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw training data.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the raw training data.
df_train.info()

In [None]:
# Load the raw test data from test.csv (path is relative to the working directory).
df_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw test data.
df_test.head()

In [None]:
# Column dtypes and non-null counts of the raw test data.
df_test.info()

In [None]:
# Report how many distinct values each training column holds
# (unique() counts a missing value as one distinct value).
unique_count_template = "The number of unique values in {} is {}"
for column_name in df_train.columns:
    distinct_values = df_train[column_name].unique()
    print(unique_count_template.format(column_name, len(distinct_values)))

In [None]:
# Report how many distinct values each test column holds
# (unique() counts a missing value as one distinct value).
unique_count_template = "The number of unique values in {} is {}"
for column_name in df_test.columns:
    distinct_values = df_test[column_name].unique()
    print(unique_count_template.format(column_name, len(distinct_values)))

In [None]:
# Remove the same five columns from both raw frames. drop() returns a new
# frame, so df_train and df_test themselves are left untouched.
columns_to_drop = ["Lead_name", "Contact_no", "POC_name", "Lead_POC_email", "Date_of_creation"]
train = df_train.drop(columns=columns_to_drop)
test = df_test.drop(columns=columns_to_drop)

In [None]:
# Number of missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isnull().sum()

In [None]:
# Strip every literal "$" from the string cells of the training frame.
# The pattern is a raw string: "\$" in a normal string literal is an invalid
# escape sequence, which Python warns about and plans to reject.
train = train.replace(r'\$', '', regex=True)

In [None]:
# Strip every literal "$" from the string cells of the test frame.
# The pattern is a raw string: "\$" in a normal string literal is an invalid
# escape sequence, which Python warns about and plans to reject.
test = test.replace(r'\$', '', regex=True)

In [None]:
# Use Deal_title as the row index; this moves it out of the regular columns.
train = train.set_index('Deal_title')

In [None]:
# Preview the training frame after re-indexing.
train.head()

In [None]:
# Use Deal_title as the row index; this moves it out of the regular columns.
test = test.set_index('Deal_title')

In [None]:
# Preview the test frame after re-indexing.
test.head()

In [None]:
# Column dtypes and non-null counts of the training frame at this stage.
train.info()

In [None]:
# Parse the two amount columns as numbers in both frames; any value that
# cannot be parsed becomes NaN (errors='coerce').
for frame in (train, test):
    for amount_col in ('Weighted_amount', 'Deal_value'):
        frame[amount_col] = pd.to_numeric(frame[amount_col], errors='coerce')

In [None]:
# Fill missing Industry values with each frame's own most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call acts on a temporary
# and leaves the frame unchanged.
train["Industry"] = train["Industry"].fillna(train["Industry"].mode()[0])
test["Industry"] = test["Industry"].fillna(test["Industry"].mode()[0])

In [None]:
# Fill missing Last_lead_update values with each frame's own most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call acts on a temporary
# and leaves the frame unchanged.
train["Last_lead_update"] = train["Last_lead_update"].fillna(train["Last_lead_update"].mode()[0])
test["Last_lead_update"] = test["Last_lead_update"].fillna(test["Last_lead_update"].mode()[0])

In [None]:
# Fill missing Resource values with each frame's own most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call acts on a temporary
# and leaves the frame unchanged.
train["Resource"] = train["Resource"].fillna(train["Resource"].mode()[0])
test["Resource"] = test["Resource"].fillna(test["Resource"].mode()[0])

In [None]:
# Fill missing Deal_value entries with each frame's own median.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call acts on a temporary
# and leaves the frame unchanged.
train['Deal_value'] = train['Deal_value'].fillna(train['Deal_value'].median())
test['Deal_value'] = test['Deal_value'].fillna(test['Deal_value'].median())

In [None]:
# Fill missing Weighted_amount entries with each frame's own median.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call acts on a temporary
# and leaves the frame unchanged.
train['Weighted_amount'] = train['Weighted_amount'].fillna(train['Weighted_amount'].median())
test['Weighted_amount'] = test['Weighted_amount'].fillna(test['Weighted_amount'].median())

In [None]:
# Frequency of each Geography value in train, with missing values counted as their own group.
train['Geography'].value_counts(dropna=False)

In [None]:
# Show the training rows whose Geography is missing.
train[train['Geography'].isnull()]

# From the rows above, we can see that locations in the USA have a comma in them, while locations in India do not. So we can fill the missing Geography values with USA where the location contains a comma, and with India where it does not.

In [None]:
# Fill missing Geography in train from Location: a Location with a second,
# space-separated part is treated as USA, one without it as India.
# `n` is passed by keyword because it is keyword-only since pandas 2.0.
# reindex() guarantees column 1 exists even if no Location contains a space.
location_parts = train['Location'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
# Plain boolean array, so the mask is applied by position and not by index label.
has_no_state = location_parts[1].isna().to_numpy()
train['Geography'] = train['Geography'].fillna('USA')
# Note: this sets India on every such row, not only on rows that were missing.
train.loc[has_no_state, 'Geography'] = 'India'

In [None]:
# Fill missing Geography in test from the test frame's own Location column:
# a Location with a second, space-separated part is treated as USA, one
# without it as India. (Splitting train's Location here would align on the
# wrong index and mislabel the test rows.)
# `n` is passed by keyword because it is keyword-only since pandas 2.0.
# reindex() guarantees column 1 exists even if no Location contains a space.
test_location_parts = test['Location'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
# Plain boolean array, so the mask is applied by position and not by index label.
test_has_no_state = test_location_parts[1].isna().to_numpy()
test['Geography'] = test['Geography'].fillna('USA')
# Note: this sets India on every such row, not only on rows that were missing.
test.loc[test_has_no_state, 'Geography'] = 'India'

In [None]:
# Show the training rows whose Location is missing.
train[train['Location'].isnull()]

In [None]:
# Remove rows whose Success_probability exceeds 100 (presumably invalid for a
# percentage-valued target — confirm against the data description).
# A boolean mask is used instead of dropping by index label: the index is
# Deal_title, which is not guaranteed unique, and a label-based drop would
# also remove valid rows that share a title with an invalid one.
# Rows with a missing target are kept, since NaN > 100 is False.
train = train.loc[~(train['Success_probability'] > 100)].copy()

In [None]:
# Number of distinct non-missing values per column in train.
train.nunique()

In [None]:
# Collect the names of all object-dtype columns in train; these are the
# columns treated as categorical further down.
object_cols = [name for name, dtype in train.dtypes.items() if dtype == 'object']
print(object_cols)

In [None]:
# Convert every categorical column to integer codes.
# One encoder per column is fitted on the union of the train and test values
# so that a given category maps to the same code in both frames. Fitting
# separately on each frame assigns codes by each frame's own sorted values,
# so the same category could get different codes in train and test.
for col in object_cols:
    # Missing values become the string 'nan' and are encoded like any category.
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train_values, test_values], ignore_index=True))
    train[col] = label_encoder.transform(train_values)
    test[col] = label_encoder.transform(test_values)

In [None]:
# Preview the training frame after label encoding.
train.head()

In [None]:
# Separate the target from the features. X is the same object as `train`
# (not a copy), and pop() removes the target column from it in place.
X = train
y = X.pop('Success_probability')

In [None]:
# Display the target series.
y

In [None]:
# Hold out 20% of the rows for validation. random_state is fixed so the
# split, and the score computed from it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a 250-tree random forest on the training split. random_state is fixed
# so the fitted model and its predictions are reproducible.
rfr = RandomForestRegressor(n_estimators=250, random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out validation rows.
predictions = rfr.predict(X_test)

In [None]:
# Display the training split of the feature frame.
X_train

In [None]:
# Validation score: 100 minus the root-mean-squared error, floored at 0.
rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
score = max(0, 100 - rmse)
print(score)

In [None]:
# Predict on the test features and write only the predictions, keyed by the
# test frame's index, to output.csv.
# The predictions go into a separate frame instead of being added to `test`
# and followed by a hard-coded column drop: `test` keeps exactly its feature
# columns, so this cell can be re-run without predict() failing.
final = rfr.predict(test)
submission = pd.DataFrame({'Success_probability': final}, index=test.index)
submission.to_csv('output.csv')