In [1]:
# installed packages
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.perceptron import PerceptronTagger
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import warnings
import numpy as np
# silencing the warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
'''
!pip install nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

!pip install numba
'''

# created packages
import nlp_ops as nlp

In [237]:
def read_data():
    '''
    reads the train and test data
    '''
    train_data = pd.read_csv("Train.csv")
    test_data = pd.read_excel("Test.xlsx")
    return train_data, test_data

In [238]:
def remove_city(row, city_str):
    '''
    This function cleans the city present in job desig column
    '''
    row_list =  [each_word.lower().strip('.').strip().strip('.') for each_word in row.split()]
    for elem in row_list:
        if elem in city_str:
            row_list.remove(elem)
    return ' '.join(row_list)

In [239]:
def clean_data(stitched_data):
    '''
    this function does data cleaning
    '''
    print("Dropping job_type column")
    stitched_data.drop(columns = ['job_type'], inplace = True)
    # Fill missing value, replace '...' from text columns, cleaning location columns
    for col in stitched_data.columns:
        if stitched_data.dtypes[col] == 'object':
            stitched_data[col] = stitched_data[col].str.lower()
            stitched_data[col] = stitched_data[col].str.replace('\(.*\)', '')
            stitched_data[col] = stitched_data[col].str.replace('\.\.\.', '')
            stitched_data[col] = stitched_data[col].fillna(stitched_data[col].mode()[0])
        elif stitched_data.dtypes[col] == 'float64':
            stitched_data[col] = stitched_data[col].fillna(stitched_data[col].mean())

    # remove city name from columns 'job_description', 'job_desig'
    city_name = pd.read_csv('india_city_list.csv')
    city_name = city_name.India_city_list.str.lower().str.strip().replace('.','')
    city_str =' '.join(city_name.to_list())
    stitched_data['job_desig'] = stitched_data['job_desig'].apply(lambda row :remove_city(row, city_str))
    stitched_data['job_description'] = stitched_data['job_description'].apply(lambda row :remove_city(row, city_str))
    stitched_data['location_type'] = ['tier_one' if x in ['mumbai', 'bengaluru', 'delhi','delhi ncr',
                                              'chennai', 'hyderabad','gurgaon', 
                                              'gurugram', 'kolkata'] else 'tier_two' for x in stitched_data['location']] 
    
    stitched_data.drop(columns = ['location'], inplace = True)
    for col in stitched_data.columns:
        if stitched_data.dtypes[col] == 'object':
            stitched_data[col] = stitched_data[col].fillna(stitched_data[col].mode()[0])
        elif stitched_data.dtypes[col] == 'float64':
            stitched_data[col] = stitched_data[col].fillna(stitched_data[col].mean())
    
    return stitched_data

In [240]:
def EDA(stitched_data):
    '''
    creates the EDA report using pyEDA package(self created)
    '''
    from pyEDA.pyEDA.data_profile import profile
    eda_result = profile(stitched_data, get_summary=True,
                get_data_type=True,
                get_skewness={'df':True, 'plot' :False},
                get_kurtosis=None,
                get_missing={'df':True, 'plot' :False},
                get_missing_visual={'df':True, 'plot' :False},
                get_distinct={'df':True, 'plot' :True},
                get_categorical_count={'df':True, 'plot' :False},
                get_numerical_dist_plot={'df':True, 'plot' :False},
                get_numeric_box_plot={'df':True, 'plot' :False},
                get_row_wise_missing={'df':True, 'plot' :False},
                get_correlation={'df':True, 'plot' :False},
                generate_html_report=True)
    return eda_result

In [241]:
def prepare_text_col(word2vec_model, data):
    '''
    this function creates a numerical vector of the preprocessing include :
    filter_tag_pos,  lemmatize_word, stop-words, punctuation removal, number removal
    '''
    print("inside clean data")
    # cleaning Data Text columns - 'job_description', 'job_desig', 'key_skills'
    nlp_cols = ['job_description', 'job_desig', 'key_skills']    
    cleaned_text = nlp.text_clean(word2vec_model, data, nlp_cols)
    for col in nlp_cols:
        cleaned_text[col] = cleaned_text[col].apply(lambda x : abs(np.mean(x)) *10000)
    return cleaned_text

In [242]:
def data_prep(cleaned_data):
    '''
    this function does data preparation including dummy data creation and feature engineering
    '''
    cleaned_data[['exp_lower', 'exp_upper']] = cleaned_data.experience.str.split('-', expand=True)
    cleaned_data['exp_upper'] = cleaned_data.exp_upper.str.split(' ', expand=True)[0]
    cleaned_data.drop(columns = ['experience'], inplace = True)
    # get dummies
    cat_vars = ['location_type']
    for var in cat_vars:
            cat_list='var'+'_'+var
            cat_list = pd.get_dummies(cleaned_data[var], prefix=var)
            cleaned_data = cleaned_data.join(cat_list)
    cleaned_data.drop(columns =['location_type','index'], inplace = True)

In [243]:
def stitch_train_test(train_data, test_data):
    test_data.drop(columns = [clm for clm in test_data.columns if clm not in train_data.columns], inplace = True)
    target = train_data['salary']
    train_data.drop(columns = ['salary'], inplace=True)
    ind = len(train_data)
    stitched_data = train_data.append(test_data).reset_index()
    return stitched_data, target, ind

In [247]:
def find_feat_imp(x, y):
    '''
    calculate feature importance
    '''
    from sklearn.feature_selection import SelectFromModel
    sel = SelectFromModel(RandomForestRegressor(n_estimators = 100))
    sel.fit(x, y)
    selected_feat= x.columns[(sel.get_support())].tolist()
    selected_feat.append('key_skills')
    return selected_feat

In [245]:
def RF_regressor(x_train, y_train, x_test):
    '''
    model building
    '''
    from sklearn.ensemble import RandomForestRegressor
    test_data_cpy = x_test.copy()
    feat = x_train.copy()
    target = y_train['salary_final']
    X,x,Y,y = train_test_split(feat,target,random_state=119)
    print(len(X),len(x),len(Y),len(y))
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
    regressor.fit(x, y)
    y_pred = regressor.predict(x)
    
    # Checking RMSE on test set
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    rms = sqrt(mean_squared_error(y, y_pred))
    print('RMSE of RF regressor on test set: {:.2f}'.format(rms))
    
    # training on complete train set
    regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 
    regressor.fit(feat,target)
    # predicting on test data
    y_test_pred = regressor.predict(x_test.drop(columns = ['salary_final']))
    test_data_cpy['salary'] = y_test_pred
    test_data_cpy['salary'].to_csv("submission_rfr.csv", index = False)  
    print("RF Model Done")
    return test_data_cpy

#### main()

In [252]:
# reading in data
train_data, test_data = read_data()
print("Read Data")
stitched_data, target, ind = stitch_train_test(train_data, test_data)
print("Data Stitched")

# creating visualization and EDA report (Note :  this is a self created package, hence commented this)
#eda_result = EDA(stitched_data.drop(columns = ['index']))

# cleaning the data
cleaned_data = clean_data(stitched_data)

# preparing the data
cleaned_data = data_prep(cleaned_data)

# preparing text column
#word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#cleansed_text_data = prepare_text_col(word2vec_model, cleaned_data)
print("Data Cleansed")


#cleansed_text_data.to_csv('cleaned_text_data.csv', index = False)
cleansed_text_data = pd.read_csv('cleaned_text_data.csv')

# filling na got from word2vec processing , as all the words are not present in googles's pretrained model 
cleansed_text_data = cleansed_text_data.fillna(cleansed_text_data.min())

# making a copy of cleaned_data
#cleansed_text_data_cpy = cleansed_text_data.copy()

# creating traina nd test data from stitched data
train_data = cleansed_text_data[:ind]
test_data = cleansed_text_data[ind:]
train_data['salary'] = target
train_data[['salary_low', 'salary_upp']] = train_data.salary.str.split('to', expand=True)
train_data['salary_final'] = (train_data['salary_low'].astype(int) + train_data['salary_upp'].astype(int)) / 2
train_data.drop(columns =['salary', 'salary_low','salary_upp'], inplace = True)
test_data['salary_final'] = 0

# fetching all the column names
col = train_data.columns.tolist()

# scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_data = sc.fit_transform(train_data)
test_data = sc.transform(test_data)

# putting the scaled(numpy array) back into pandas data frame
train_data = pd.DataFrame(train_data, columns = col)
test_data = pd.DataFrame(test_data, columns = col)

# creating x_train, y_train and x_test
x_train = train_data.drop(columns=['salary_final'])
y_train = pd.DataFrame(train_data['salary_final'], columns=['salary_final'])
x_test = test_data

# with importance
imp_feat = find_feat_imp(x_train, y_train)
print("The features importance is calculated using random forest and they are : ", imp_feat)

# without feature importance fitting model
test_data_result_without_feat_imp = RF_regressor(x_train, y_train, x_test)

# with feature importance fitting model
x_train = train_data.drop(columns=['salary_final'])[['company_name_encoded', 'exp_upper','key_skills']]
y_train = pd.DataFrame(train_data['salary_final'], columns=['salary_final'])
x_test = test_data[['salary_final','company_name_encoded', 'exp_upper','key_skills']]
test_data_result_with_feat_imp = RF_regressor(x_train, y_train, x_test)

# writing the result back
test_data['salary_final'] = test_data_result_with_feat_imp['salary']
final_test_test = pd.DataFrame(sc.inverse_transform(test_data), columns = col)

final_test_test.to_csv("basic_rfr.csv", index = False)

Read Data
Data Stitched
Dropping job_type column
Data Cleansed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

The features importance is calculated using random forest and they are :  ['company_name_encoded', 'exp_upper', 'key_skills']
14251 4751 14251 4751
RMSE of RF regressor on test set: 0.30




RF Model Done
14251 4751 14251 4751
RMSE of RF regressor on test set: 0.31
RF Model Done




In [257]:
train_data, test_data = read_data()
test_data['final'] = final_test_test['salary_final']
test_data.to_csv("result.csv", index = False)