In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')
from pandas_profiling import ProfileReport
from matplotlib import pyplot as plt

# Reading the data

In [None]:
# Load the train and test CSV files into DataFrames.
train_data = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
)
test_data = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv"
)

In [None]:
# (rows, columns) of the train data, then of the test data, one per line.
print(train_data.shape, test_data.shape, sep="\n")

In [None]:
# List the column labels of both data sets, separated by a rule.
print("columns of train data :\n", train_data.columns)
print("-------------------------------------------------------")
print("columns of test data :\n", test_data.columns)

In [None]:
# Number of missing cells per column, for the train data and then the test data.
print('Missing values in train dataset:\n\n', train_data.isna().sum())
print("------------------------------------------")
print('\n\nMissing values in test dataset:\n\n', test_data.isna().sum())

It seems that the data needs a lot of cleaning. It is also split across two separate files (train and test), so every cleaning step has to be applied to both of them.

To get a detailed report on the data, we use the **pandas_profiling** tool.
The report gives a high-quality summary of the data, **so it is worth reading before going further.**

In [None]:
# Build one profiling report per data set (they are rendered in the next cells).
# The test report must be built from test_data; it previously profiled
# train_data a second time under the "testing_data" title.
profile_train_data = ProfileReport(train_data, title="Job Change of Data Scientists training_data profiling report ")
profile_test_data = ProfileReport(test_data, title="Job Change of Data Scientists testing_data profiling report ")

In [None]:
# Display the report object stored in profile_train_data.
profile_train_data

In [None]:
# Display the report object stored in profile_test_data.
profile_test_data

In [None]:
# Preview the first 10 rows of the train data.
train_data.head(10)

In [None]:
# Preview the first 10 rows of the test data.
test_data.head(10)

Now let's look at the types of data and their details.
# Let's go deeper into the data.

In [None]:
# Print the DataFrame.info() summary of the train data.
train_data.info()

In [None]:
# Print the DataFrame.info() summary of the test data.
test_data.info()

> **Here we compute the percentage of missing values per column, which we previously saw in the report.**

In [None]:

# Share of missing values per column in the train data, highest first.
# At most 20 columns are plotted (the old "Top 15" comment did not match
# the head(20) in the code).

sns.set_style("whitegrid")
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,4))
# isnull().mean() is the fraction of missing cells in each column.
df = train_data.isnull().mean().sort_values(ascending=False).head(20)
sns.barplot(x=df.index, y=df, palette="Blues_d")
plt.title("Share of missing values per column (train data)")
plt.ylabel("Fraction missing")
plt.xticks(rotation=90)
# Render the figure explicitly so the cell does not end on the xticks repr.
plt.show()


In [None]:
# Map of the missing cells in the train data (boolean mask, no colour bar).
sns.heatmap(train_data.isna(), cmap='viridis', cbar=False)

In [None]:
# Share of missing values per column in the test data, highest first.
# At most 20 columns are plotted (the old "Top 5" comment did not match
# the head(20) in the code).

sns.set_style("whitegrid")
plt.style.use('fivethirtyeight')
plt.figure(figsize=(15,4))
# isnull().mean() is the fraction of missing cells in each column.
df = test_data.isnull().mean().sort_values(ascending=False).head(20)
sns.barplot(x=df.index, y=df, palette="Blues_d")
plt.title("Share of missing values per column (test data)")
plt.ylabel("Fraction missing")
plt.xticks(rotation=90)
# Render the figure explicitly so the cell does not end on the xticks repr.
plt.show()


In [None]:
# Map of the missing cells in the test data (boolean mask, no colour bar).
sns.heatmap(test_data.isna(), cmap='viridis', cbar=False)

# Let's make it look better

In [None]:
# Missing data for the train data: absolute count and share per column.
null_counts = train_data.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / train_data.shape[0]).sort_values(ascending=False)

missin_train = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Perc_missing'],
)
missin_train.head(10)



In [None]:
# Missing data for the test data: absolute count and share per column.
# (This cell was previously labelled as train data.)
total = test_data.isnull().sum().sort_values(ascending=False)
# Derive the share from the already-sorted counts so both columns are
# guaranteed to have the same index order.
percent = total / test_data.shape[0]

missing_test = pd.concat([total, percent], axis=1, keys=['Total', 'Perc_missing'])
missing_test.head(10)


In [None]:
# Drop the enrollee_id column, then drop every row that has at least one
# missing value.
#
# The former per-column calls (train_data["gender"].dropna(inplace=True), ...)
# were removed: they operate on a temporary Series and never changed
# train_data. The frame-wide dropna() below is what removes the rows.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
# NOTE(review): dropping every incomplete row can discard a large part of the
# data - check train_data.shape afterwards and consider imputation instead.
train_data = (
    train_data
    .drop(columns=['enrollee_id'], errors='ignore')
    .dropna()
)

In [None]:
# Missing cells per column in the train data after cleaning.
train_data.isna().sum()

In [None]:
# Columns and shape of the train data after cleaning.
print(train_data.columns.tolist())
print("------------------------------------------------------")
print(train_data.shape)

In [None]:
# Drop every row of the test data that has a missing value, then drop the
# enrollee_id column (same order as before).
# Rebinding instead of mutating in place, together with errors='ignore',
# lets the cell be re-run without raising KeyError for the missing column.
test_data = (
    test_data
    .dropna()
    .drop(columns=['enrollee_id'], errors='ignore')
)

In [None]:
# Missing cells per column in the test data after cleaning.
test_data.isna().sum()

In [None]:
# Columns and shape of the test data after cleaning.
print(test_data.columns.tolist())
print("--------------------------------------------------")
print(test_data.shape)

**Now I will modify the values inside the columns.**

In [None]:
# Frequency of each distinct value in the experience column.
train_data['experience'].value_counts()

We notice some non-numeric values such as "<1" and ">20".
We will convert these values to numbers.

In [None]:
def replacment(experience):
    """Convert a raw ``experience`` label into an integer.

    Parameters
    ----------
    experience : object
        A single cell value of the ``experience`` column, e.g. ``'>20'``,
        ``'<1'`` or a digit string such as ``'7'``.

    Returns
    -------
    int or object
        ``21`` for ``'>20'``, ``0`` for ``'<1'``, ``int(value)`` for digit
        strings. Anything else (e.g. NaN/None) is returned unchanged.
    """
    if experience == '>20':
        return 21
    if experience == '<1':
        return 0
    # Digit strings used to be passed through as str, leaving the column with
    # mixed int/str values; convert them so the result is uniformly numeric.
    if isinstance(experience, str) and experience.strip().isdigit():
        return int(experience)
    return experience

In [None]:
# Apply the experience conversion to every row of the train data.
train_data['experience'] = train_data['experience'].map(replacment)

In [None]:
# Frequency of each experience value after the mapping above.
train_data['experience'].value_counts()

In [None]:
# Distinct values present in the last_new_job column.
train_data['last_new_job'].unique()

**We will work to change or delete those unwanted values.**

In [None]:
# Frequency of each distinct value in the last_new_job column.
train_data['last_new_job'].value_counts()

In [None]:
# Same approach as for the experience column, with the labels of last_new_job.
def replacement_2(last_new_job):
    """Convert a raw ``last_new_job`` label into an integer.

    Parameters
    ----------
    last_new_job : object
        A single cell value of the ``last_new_job`` column, e.g. ``'>4'``,
        ``'never'`` or a digit string such as ``'2'``.

    Returns
    -------
    int or object
        ``5`` for ``'>4'``, ``0`` for ``'never'``, ``int(value)`` for digit
        strings. Anything else (e.g. NaN/None) is returned unchanged.
    """
    if last_new_job == '>4':
        return 5
    if last_new_job == 'never':
        return 0
    # Digit strings used to be passed through as str, leaving the column with
    # mixed int/str values; convert them so the result is uniformly numeric.
    if isinstance(last_new_job, str) and last_new_job.strip().isdigit():
        return int(last_new_job)
    return last_new_job

# Apply the last_new_job conversion to every row of the train data.
train_data['last_new_job'] = train_data['last_new_job'].map(replacement_2)

In [None]:
# Distinct last_new_job values after the mapping above.
train_data['last_new_job'].unique()

# Now we will review one of the most important columns, which is the target column.

In [None]:
# Pie chart of how often each value of the target column occurs.
target_counts = train_data['target'].value_counts()
value = target_counts.values.tolist()
labels = target_counts.index

plt.figure(figsize=(5, 5))
plt.title('The ratio of each component to the target column.')
plt.pie(x=value, labels=labels, autopct='%1.f%%', pctdistance=.5)
plt.show()

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# Selecting them explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer silently skips string columns and raises instead.
train_data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
# Numeric (and boolean) columns are selected explicitly because, on
# pandas >= 2.0, DataFrame.corr() raises when the frame holds string columns.
numeric_corr = train_data.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, ax=ax)
plt.show()

In [None]:

# Aggregate the train data per (education_level, target, gender) group with
# pivot_table's default aggregation, print the table and plot a histogram of it.
group_keys = ['education_level', 'target', 'gender']
table = pd.pivot_table(train_data, index=group_keys)
print(table)
table.plot(kind='hist', figsize=(15, 10), colormap="Dark2")

I will apply machine learning algorithms to this data later.

# Thank you very much