In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt # plotting (histograms, scatter plots, box plots)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two competition CSV files into DataFrames.
train_csv = '/kaggle/input/hackerearths-reduce-marketing-waste/train.csv'
test_csv = '/kaggle/input/hackerearths-reduce-marketing-waste/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)
# Cell output: the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Analysis of Training Data

In [None]:
# Print the training frame's column dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics of the training frame (pandas describes the numeric columns by default).
train.describe()

In [None]:
# A probability cannot exceed 100, so every training row whose target is above that bound is removed.
over_limit_rows = train.index[train['Success_probability'] > 100]
train.drop(index=over_limit_rows, inplace=True)

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Report how many distinct (non-null) values each training column holds.
for column, distinct in train.nunique().items():
    print(column, "-->", distinct)

In [None]:
# Report how many distinct (non-null) values each test column holds.
for column, distinct in test.nunique().items():
    print(column, "-->", distinct)

# EDA (Exploratory Data Analysis)

### Converting the Deal_value and Weighted_amount columns to float by removing the '$' character

In [None]:
# For each money column of the training frame: render the values as text,
# keep only what precedes the first '$', and parse the remainder as float64.
for money_col in ('Deal_value', 'Weighted_amount'):
    as_text = train[money_col].astype('str')
    before_dollar = as_text.map(lambda raw: str(raw).split('$')[0])
    train[money_col] = before_dollar.astype('float64')


In [None]:
# For each money column of the test frame: render the values as text,
# keep only what precedes the first '$', and parse the remainder as float64.
for money_col in ('Deal_value', 'Weighted_amount'):
    as_text = test[money_col].astype('str')
    before_dollar = as_text.map(lambda raw: str(raw).split('$')[0])
    test[money_col] = before_dollar.astype('float64')


### Extracting year from Date_of_creation column

In [None]:
# Keep only the text before the first '-' of the creation date (training frame)
# and rename the column to 'Year'.
creation_text = train['Date_of_creation'].astype('str')
train['Date_of_creation'] = creation_text.map(lambda stamp: stamp.split('-')[0]).astype('object')
train.rename(columns={'Date_of_creation': 'Year'}, inplace=True)

In [None]:
# Keep only the text before the first '-' of the creation date (test frame)
# and rename the column to 'Year'.
creation_text = test['Date_of_creation'].astype('str')
test['Date_of_creation'] = creation_text.map(lambda stamp: stamp.split('-')[0]).astype('object')
test.rename(columns={'Date_of_creation': 'Year'}, inplace=True)

### Extracting the code after the comma from Location (missing or comma-less values default to 'IND')

In [None]:
def _location_code(location):
    """Return the second comma-separated field of ``location``, or 'IND'.

    Surrounding whitespace is stripped on both sides, so "City, XX" yields
    "XX" rather than " XX" (the previous ``rstrip`` kept the leading space).
    Values without a comma fall back to 'IND'.
    """
    parts = str(location).split(',')
    return parts[1].strip() if len(parts) > 1 else 'IND'


# Missing locations are filled with 'IND' first, which has no comma and therefore maps to 'IND'.
train['Location'] = train['Location'].fillna('IND').map(_location_code)
test['Location'] = test['Location'].fillna('IND').map(_location_code)

## Handling Missing Values

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Count the missing values in each test column.
test.isna().sum()

In [None]:
# Fill missing money values with the TRAINING median in both frames.
# - Results are assigned back instead of calling fillna(inplace=True) on a
#   selected column: that chained in-place form is deprecated in pandas 2.x
#   and does not modify the frame under Copy-on-Write.
# - The test frame reuses the training statistic so both frames are
#   preprocessed with the same value.
for money_col in ('Deal_value', 'Weighted_amount'):
    train_median = train[money_col].median()
    train[money_col] = train[money_col].fillna(train_median)
    test[money_col] = test[money_col].fillna(train_median)

In [None]:
# The Industry column has only one NaN value, so it is filled with the most
# frequent category ('Banks'). The result is assigned back rather than using
# fillna(inplace=True) on a selected column, which is deprecated in pandas 2.x
# and does not modify the frame under Copy-on-Write.
train['Industry'] = train['Industry'].fillna('Banks')
test['Industry'] = test['Industry'].fillna('Banks')

In [None]:
# Fill missing Geography values with the most frequent TRAINING category in
# both frames. Assigned back instead of the chained fillna(inplace=True),
# which is deprecated in pandas 2.x and a no-op under Copy-on-Write.
geography_mode = train['Geography'].mode()[0]
train['Geography'] = train['Geography'].fillna(geography_mode)
test['Geography'] = test['Geography'].fillna(geography_mode)
# Cell output: category frequencies after imputation.
train['Geography'].value_counts()

In [None]:
# Fill missing Last_lead_update values with the most frequent TRAINING
# category in both frames. Assigned back instead of the chained
# fillna(inplace=True), which is deprecated in pandas 2.x and a no-op under
# Copy-on-Write.
last_update_mode = train['Last_lead_update'].mode()[0]
train['Last_lead_update'] = train['Last_lead_update'].fillna(last_update_mode)
test['Last_lead_update'] = test['Last_lead_update'].fillna(last_update_mode)
# Cell output: category frequencies after imputation.
train['Last_lead_update'].value_counts()

In [None]:
# Fill missing Resource values with the most frequent TRAINING category in
# both frames. Assigned back instead of the chained fillna(inplace=True),
# which is deprecated in pandas 2.x and a no-op under Copy-on-Write.
resource_mode = train['Resource'].mode()[0]
train['Resource'] = train['Resource'].fillna(resource_mode)
# NOTE(review): the row labelled 0 is forced to 'No'; the reason is not visible
# in this notebook. If label 0 was removed by an earlier row drop, this
# assignment appends a new, mostly-NaN row instead -- confirm this is intended.
train.loc[0,'Resource']='No'
test['Resource'] = test['Resource'].fillna(resource_mode)
# Cell output: category frequencies after imputation.
train['Resource'].value_counts()

## Handling Categorical Variables

In [None]:
# Remove the same five columns from both frames.
dropped_columns = ['Deal_title','Lead_name','Contact_no','POC_name','Lead_POC_email']
for frame in (train, test):
    frame.drop(columns=dropped_columns, inplace=True)

In [None]:
# Collect the names of the training columns whose dtype is 'object', then print them.
categorical_features = [
    column for column, dtype in train.dtypes.items() if dtype == 'object'
]
print("Categorical Features in the Dataset are:", "", categorical_features, sep="\n")

In [None]:
# Convert every categorical feature to the pandas 'category' dtype in both frames.
to_category = dict.fromkeys(categorical_features, 'category')
train = train.astype(to_category)
test = test.astype(to_category)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every categorical feature with ONE mapping shared by train and test.
# Fitting the encoder separately on each frame (as before) can give the same
# category different codes in train and test whenever their category sets
# differ, which makes the encoded test features meaningless to the model.
lb=LabelEncoder()
for column in categorical_features:
    # Compare labels as plain strings so both frames use one consistent key type.
    train_labels = train[column].astype(str)
    test_labels = test[column].astype(str)
    # Fit on the union so categories that occur only in the test frame still get a code.
    lb.fit(pd.concat([train_labels, test_labels], ignore_index=True))
    train[column] = lb.transform(train_labels)
    test[column] = lb.transform(test_labels)

# Visualising our Data using Histograms and Scatter Plots

In [None]:
# One 25-bin histogram per training column.
# `plt` comes from the notebook's top import cell; previously this cell ran
# before matplotlib was imported and failed with NameError on a fresh kernel.
# The column is plotted directly -- copying the whole frame on every iteration
# was unnecessary because nothing is modified.
for feature in train.columns:
    fig, ax = plt.subplots()
    train[feature].hist(bins=25, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    ax.set_title(feature)
    plt.show()
    plt.close(fig)  # release the figure so a long loop does not accumulate memory

In [None]:
# Log-log scatter plot of every feature against the target.
# `plt` comes from the notebook's top import cell.
# Changes from the previous version:
#   * the target is log-transformed once, outside the loop; before, the
#     iteration where feature == target applied the log twice;
#   * the target is not plotted against itself;
#   * log(0) = -inf and log(negative) = NaN are expected here (encoded
#     categories start at 0), so the numpy warnings are silenced and infinite
#     values become NaN, which matplotlib leaves out of the plot;
#   * no full-frame copy per iteration.
target = 'Success_probability'


def _safe_log(values):
    """Return the natural log of ``values`` with +/-inf replaced by NaN."""
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(values)
    return logged.replace([np.inf, -np.inf], np.nan)


log_target = _safe_log(train[target])
for feature in train.columns.drop(target):
    fig, ax = plt.subplots()
    ax.scatter(_safe_log(train[feature]), log_target)
    ax.set_xlabel(feature)
    ax.set_ylabel('Success_probability')
    ax.set_title(feature)
    plt.show()
    plt.close(fig)  # release the figure so a long loop does not accumulate memory

## Looking for Outliers using Box Plots

In [None]:
import matplotlib.pyplot as plt
# One box plot per training column, drawn on the natural-log scale.
# log(0) = -inf and log(negative) = NaN are expected here (encoded categories
# start at 0): the numpy warnings are silenced and infinite values are turned
# into NaN so they are excluded from the box statistics instead of distorting
# them. Only the plotted column is transformed; the frame is no longer copied
# on every iteration.
for feature in train.columns:
    with np.errstate(divide='ignore', invalid='ignore'):
        log_values = np.log(train[feature])
    log_values = log_values.replace([np.inf, -np.inf], np.nan)
    fig, ax = plt.subplots()
    log_values.to_frame().boxplot(column=feature, ax=ax)
    ax.set_ylabel(feature)
    ax.set_title(feature)
    plt.show()
    plt.close(fig)  # release the figure so a long loop does not accumulate memory

In [None]:
# Print the frequency of each Internal_rating value, first for train and then for test.
for heading, frame in (("Train--> Internal rating", train), ("Test--> Internal rating", test)):
    print(heading)
    print(frame['Internal_rating'].value_counts())

In [None]:
# Test-set ratings equal to -1.00 or 82.34 are overwritten with 4.00,
# after which the column is cast to 64-bit integers.
# NOTE(review): the choice of 4.00 as the replacement is not explained in the notebook.
odd_rating = test['Internal_rating'].isin([-1.00, 82.34])
test.loc[odd_rating, 'Internal_rating'] = 4.00
test['Internal_rating'] = test['Internal_rating'].astype('int64')

In [None]:
# Print the frequency of each Location value, first for train and then for test.
for heading, frame in (("Train--> Location", train), ("Test--> Location", test)):
    print(heading)
    print(frame['Location'].value_counts())

In [None]:
# Print the frequency of each Industry value, first for train and then for test.
for heading, frame in (("Train--> Industry", train), ("Test--> Industry", test)):
    print(heading)
    print(frame['Industry'].value_counts())

In [None]:
# Print the frequency of each Hiring_candidate_role value, first for train and then for test.
for heading, frame in (("Train--> Hiring_candidate_role", train), ("Test--> Hiring_candidate_role", test)):
    print(heading)
    print(frame['Hiring_candidate_role'].value_counts())

### The columns Hiring_candidate_role, Industry and Location contain many outliers, so they are removed from the training set

In [None]:
# Y is the target column; X is the training frame without the target and the three excluded columns.
excluded_columns = ['Success_probability','Hiring_candidate_role','Industry','Location']
Y = train['Success_probability']
X = train.drop(columns=excluded_columns)

# Training Model

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn import metrics
# Grid of tree depth and boosting rounds to evaluate (3 x 3 combinations).
param_test = {
    'max_depth': [3, 4, 5],
    'n_estimators': [9, 10, 11],
}

# 3-fold cross-validated grid search over an XGBoost regressor.
# * scoring: GridSearchCV expects a scorer (a name, or a callable taking
#   (estimator, X, y)) and MAXIMISES it. Passing the raw metric
#   metrics.mean_squared_error has the wrong signature and, even if it were
#   callable that way, would select the worst model. The built-in
#   'neg_mean_squared_error' scorer is the correct choice.
# * objective: 'reg:linear' is a deprecated alias of 'reg:squarederror'.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(objective="reg:squarederror", learning_rate=0.11),
    param_grid=param_test,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=3,
)
gsearch1.fit(X,Y)

In [None]:
# Show the parameter combination selected by the grid search.
gsearch1.best_params_

In [None]:
# Final regressor, fitted on the full training set.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same squared-error loss).
# alpha=1 is XGBoost's L1 regularisation weight.
# NOTE(review): max_depth and n_estimators are hard-coded rather than read from
# gsearch1.best_params_ -- confirm they match the grid-search result.
model = XGBRegressor(
    learning_rate=0.11,
    max_depth=4,
    objective="reg:squarederror",
    alpha=1,
    n_estimators=9,
)
model.fit(X,Y)

In [None]:
# Remove the columns that were excluded from training, then predict.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
unused_columns = ['Hiring_candidate_role','Industry','Location']
test.drop(columns=unused_columns, errors='ignore', inplace=True)
# Select exactly the training features, in training order, so the model
# receives the same layout it was fitted on.
y_pred = model.predict(test[X.columns])

In [None]:
# Re-read the raw test file to recover Deal_title, which was dropped from `test` earlier.
# The absolute input path matches the one used when the data was first loaded;
# the previous relative '../input/...' form only resolved when the working
# directory happened to be a sibling of the input directory.
t=pd.read_csv('/kaggle/input/hackerearths-reduce-marketing-waste/test.csv')

In [None]:
# Build the submission frame: the deal titles from the raw test file plus the predicted target.
submission = pd.DataFrame({
    'Deal_title': t['Deal_title'],
    'Success_probability': y_pred,
})
# Cell output: the submission frame.
submission

In [None]:
# Write the submission frame to 'Submission17.csv' (relative to the working directory) without the index column.
submission.to_csv('Submission17.csv',index=False)