In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

import os
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Locations of the train and test CSV files inside the Kaggle input directory.
# NOTE(review): TEST_PATH is not read anywhere in the visible cells.
TRAIN_PATH = "/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
TEST_PATH = "/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv"

In [None]:
# Load the training CSV into `df` (used by every later cell) and preview the first rows.
df = pd.read_csv(TRAIN_PATH)
df.head()

In [None]:
# Distinct non-null values of the 'relevent_experience' column (the spelling is the column's own name).
df['relevent_experience'].dropna().unique()

## Exploratory Data Analysis

In [None]:
# Report the overall size of the frame, then the number of distinct enrollee ids.
n_rows, n_cols = df.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns.")
print("*" * 50, end="\n\n")

# nunique() ignores missing values, matching dropna().unique().
n_enrollees = df['enrollee_id'].nunique()
print(f"Dataset has {n_enrollees} unique user's data.")
print("*" * 50, end="\n\n")

### City and its development index analysis

In [None]:
# Frequency of every city, computed once and reused for the printout and the chart.
city_counts = df['city'].value_counts()

print(f"Dataset has {df['city'].nunique()} unique cities.")
print("_" * 20)
print(f"Unique cities count:\n{city_counts}")
print("*" * 50, end="\n\n")

# Bar chart limited to the first ten entries of the frequency table.
top_cities = city_counts.iloc[:10]
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_cities.index, y=top_cities.values,
            capsize=.2, palette="Blues_d", ax=ax)

ax.set_title("Unique city count bar chart", fontsize=16)
ax.set_xlabel("City", fontsize=14)
ax.set_ylabel("Count", fontsize=14)

plt.show()

In [None]:
# Summarise the city development index: number of distinct values and their frequencies.
dev_index_col = df['city_development_index']

print(f"Dataset has {dev_index_col.nunique()} unique city development indices.")
print("_" * 20)
print(f"Unique City Development Indices:\n{dev_index_col.value_counts()}")
print("*" * 50, end="\n\n")

## line plot
# fig, ax = plt.subplots(figsize=(16, 6))
# sns.lineplot(ax=ax, x=np.linspace(0, len(df), len(df), endpoint=True), y=df['city_development_index'].values)

# ax.set_title("todo",fontsize=16)
# ax.set_xlabel("Enrollee",fontsize=14)
# ax.set_ylabel("Count",fontsize=14)

# plt.show()

In [None]:
# Kernel density estimate of the city development index, shown and saved as a PNG.
fig, ax = plt.subplots(figsize=(10, 6))

df['city_development_index'].plot(kind='kde', ax=ax)
ax.set(ylabel="Density", xlabel="Development Index")
# The font size applies to the title text; savefig() has no such option and
# rejects unknown keywords, so it must not be passed there.
ax.set_title("KDE plot for Development Index", fontsize=15)
fig.savefig("./dev_index_density.png")
plt.show()

In [None]:
# Mean development index per city, highest first.
# The column is selected BEFORE aggregating: averaging the whole frame would
# also try to average the text columns, which raises on recent pandas.
city_with_dev = (
    df.groupby('city')['city_development_index']
    .mean()
    .reset_index()
    .sort_values(by=['city_development_index'], ascending=False)
    .reset_index(drop=True)
)
city_with_dev.head()

In [None]:
# Bar chart of the first 15 rows of city_with_dev (already sorted by development index).
top_dev_cities = city_with_dev.iloc[:15]

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x=top_dev_cities['city'],
            y=top_dev_cities['city_development_index'],
            palette="Blues_d",
            ax=ax)

ax.set_title("Top 15 cities with best development index bar chart", fontsize=16)
ax.set_xlabel("City", fontsize=14)
ax.set_ylabel("Development Index", fontsize=14)

plt.show()

In [None]:
# Mean city development index for each target value, highest first.
# Select the column before aggregating so the text columns are never averaged
# (averaging the whole frame raises on recent pandas).
city_target = (
    df.groupby('target')['city_development_index']
    .mean()
    .reset_index()
    .sort_values(by=['city_development_index'], ascending=False)
    .reset_index(drop=True)
)
print(city_target)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(ax=ax,
            x=city_target['target'],
            y=city_target['city_development_index'])

ax.set_title("Target based city's development index", fontsize=16)
ax.set_xlabel("Target", fontsize=14)
ax.set_ylabel("Development Index", fontsize=14)

plt.show()

### Gender Analysis

In [None]:
# Summarise the gender column: number of distinct values and their frequencies.
print(f"Dataset has {len(df['gender'].dropna().unique())} unique gender's data.")
print("_"*20)
print(f"Unique Gender counts:\n{df['gender'].value_counts()}")
print("*"*50, end="\n\n")

# Rows per gender. `total` counts every row, including rows whose gender is
# missing, so the three percentages need not add up to 100.
total = df.shape[0]
total_male = df.query("gender == 'Male'")
total_female = df.query("gender == 'Female'")
total_other = df.query("gender == 'Other'")

male_percent = round(len(total_male)*100/total, 3)
female_percent = round(len(total_female)*100/total, 3)
# Must be derived from total_other; using total_female here would draw the
# "Other" slice with the female share.
other_percent = round(len(total_other)*100/total, 3)

labels = 'Male Percentage', 'Female Percentage', 'Other Percentage'
sizes = [male_percent, female_percent, other_percent]
explode = (0.05, 0.05, 0.05)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Gender spread in Data', fontsize=15)
plt.savefig("./gender_pie.png")
plt.show()

In [None]:
# Violin plot of the target values for each gender category.
gender_palette = {"Male": "b", "Female": "g", "Other": "r"}

fig, ax = plt.subplots(figsize=(6, 10))
sns.violinplot(x=df['target'],
               y=df['gender'],
               palette=gender_palette,
               inner="quart",
               split=True,
               linewidth=1,
               ax=ax)

ax.set_title("Gender based on target violin plot", fontsize=16)
ax.set_xlabel("Target", fontsize=14)
ax.set_ylabel("Gender", fontsize=14)

plt.show()

In [None]:
# Gender breakdown restricted to rows with target == 1.0.
target_one_rows = df[df['target'] == 1.0]
total = len(target_one_rows)

total_male_target = target_one_rows[target_one_rows['gender'] == 'Male']
total_female_target = target_one_rows[target_one_rows['gender'] == 'Female']
total_other_target = target_one_rows[target_one_rows['gender'] == 'Other']

# Share of each gender among the target == 1.0 rows, in percent.
sizes = [
    round(100 * len(subset) / total, 3)
    for subset in (total_male_target, total_female_target, total_other_target)
]
male_target_percent, female_target_percent, other_target_percent = sizes

labels = 'Male Percentage', 'Female Percentage', 'Other Percentage'
explode = (0.1, 0.1, 0.1)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode,
        autopct='%1.1f%%', startangle=90, shadow=True)
ax1.axis('equal')
ax1.set_title('Gender with target = 1.0', fontsize=15)
fig1.savefig("./gender_target_1_pie.png")
plt.show()

In [None]:
# Gender breakdown restricted to rows with target == 0.0.
target_zero_rows = df[df['target'] == 0.0]
total = len(target_zero_rows)

total_male_target = target_zero_rows[target_zero_rows['gender'] == 'Male']
total_female_target = target_zero_rows[target_zero_rows['gender'] == 'Female']
total_other_target = target_zero_rows[target_zero_rows['gender'] == 'Other']

# Share of each gender among the target == 0.0 rows, in percent.
sizes = [
    round(100 * len(subset) / total, 3)
    for subset in (total_male_target, total_female_target, total_other_target)
]
male_target_percent, female_target_percent, other_target_percent = sizes

labels = 'Male Percentage', 'Female Percentage', 'Other Percentage'
explode = (0.1, 0.1, 0.1)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode,
        autopct='%1.1f%%', startangle=90, shadow=True)
ax1.axis('equal')
ax1.set_title('Gender with target = 0.0', fontsize=15)
fig1.savefig("./gender_target_0_pie.png")
plt.show()

In [None]:
# Frequency of each relevant-experience value, reused for the printout and the chart.
experience_counts = df['relevent_experience'].value_counts()

print(f"Dataset has {df['relevent_experience'].nunique()} unique relevant experience data.")
print("_" * 20)
print(f"Unique Experiences:\n{experience_counts}")
print("*" * 50, end="\n\n")

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=experience_counts.index,
            y=experience_counts.values,
            ax=ax)

ax.set_title("Total Data for Relevant experience", fontsize=16)
ax.set_xlabel("Relevent Experience", fontsize=14)
ax.set_ylabel("Data Count", fontsize=14)

plt.show()

In [None]:
# Relevant-experience breakdown restricted to rows with target == 1.0.
target_one_rows = df[df['target'] == 1.0]
total = len(target_one_rows)

total_experience_target = target_one_rows[
    target_one_rows['relevent_experience'] == 'Has relevent experience'
]
total_inexperience_target = target_one_rows[
    target_one_rows['relevent_experience'] == 'No relevent experience'
]

# Percentage of the target == 1.0 rows falling in each experience group.
experience_target_percentage = round(100 * len(total_experience_target) / total, 3)
inexperience_target_percentage = round(100 * len(total_inexperience_target) / total, 3)

labels = 'Has relevent experience', 'No relevent experience'
sizes = [experience_target_percentage, inexperience_target_percentage]
explode = (0.05, 0.05)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode,
        autopct='%1.1f%%', startangle=90, shadow=True)
ax1.axis('equal')
ax1.set_title('Relevant Experience with target = 1.0', fontsize=15)
fig1.savefig("./experience_target_1_pie.png")
plt.show()

In [None]:
# Relevant-experience breakdown restricted to rows with target == 0.0.
target_zero_rows = df[df['target'] == 0.0]
total = len(target_zero_rows)

total_experience_target = target_zero_rows[
    target_zero_rows['relevent_experience'] == 'Has relevent experience'
]
total_inexperience_target = target_zero_rows[
    target_zero_rows['relevent_experience'] == 'No relevent experience'
]

# Percentage of the target == 0.0 rows falling in each experience group.
experience_target_percentage = round(100 * len(total_experience_target) / total, 3)
inexperience_target_percentage = round(100 * len(total_inexperience_target) / total, 3)

labels = 'Has relevent experience', 'No relevent experience'
sizes = [experience_target_percentage, inexperience_target_percentage]
explode = (0.05, 0.05)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, explode=explode,
        autopct='%1.1f%%', startangle=90, shadow=True)
ax1.axis('equal')
ax1.set_title('Relevant Experience with target = 0.0', fontsize=15)
fig1.savefig("./experience_target_0_pie.png")
plt.show()

In [None]:
# Summarise enrolled_university: number of distinct non-null values and their frequencies.
enrolled_university_col = df['enrolled_university']

print(f"Dataset has {enrolled_university_col.nunique()} unique enrolled university data.")
print("_" * 20)
print(f"Unique enrolled university:\n{enrolled_university_col.value_counts()}")
print("*" * 50, end="\n\n")

In [None]:
# Summarise education_level: number of distinct non-null values and their frequencies.
education_level_col = df['education_level']

print(f"Dataset has {education_level_col.nunique()} unique education level data.")
print("_" * 20)
print(f"Unique education level:\n{education_level_col.value_counts()}")
print("*" * 50, end="\n\n")

In [None]:
# Summarise major_discipline: number of distinct non-null values and their frequencies.
major_discipline_col = df['major_discipline']

print(f"Dataset has {major_discipline_col.nunique()} unique major discipline data.")
print("_" * 20)
print(f"Unique major discipline:\n{major_discipline_col.value_counts()}")
print("*" * 50, end="\n\n")

In [None]:
# Summarise experience: number of distinct non-null values and their frequencies.
experience_col = df['experience']

print(f"Dataset has {experience_col.nunique()} unique experience data.")
print("_" * 20)
print(f"Unique experiences:\n{experience_col.value_counts()}")
print("*" * 50, end="\n\n")

In [None]:
# Summarise company_size: number of distinct non-null values and their frequencies.
company_size_col = df['company_size']

print(f"Dataset has {company_size_col.nunique()} unique company sizes.")
print("_" * 20)
print(f"Unique company sizes:\n{company_size_col.value_counts()}")
print("*" * 50, end="\n\n")

In [None]:
# Summarise company_type: number of distinct non-null values and their frequencies.
company_type_col = df['company_type']

print(f"Dataset has {company_type_col.nunique()} unique company types.")
print("_" * 20)
print(f"Unique company types:\n{company_type_col.value_counts()}")
print("*" * 50, end="\n\n")

### Understanding Distributions

In [None]:
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Draw one distribution plot per low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. Numeric columns are drawn as histograms, every other column
    as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. The caller's frame is not modified.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots placed on each row of the grid.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        no column qualifies or ``nGraphShown`` is not positive.
    """
    nunique = df.nunique()
    # For display purposes, keep only columns with between 2 and 49 unique values.
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)

    nShown = min(len(columnNames), nGraphShown)
    if nShown <= 0:
        # Nothing to plot; avoid creating an empty, zero-height figure.
        return

    # Integer ceil division: plt.subplot requires an integer row count.
    # The grid is sized from the plots actually drawn, not all eligible columns.
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow),
               dpi=80, facecolor='w', edgecolor='k')

    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # Decide from the column dtype rather than the first cell: a text
        # column whose first value is NaN would otherwise look numeric.
        # Booleans are kept on the bar-chart path.
        isNumeric = (pd.api.types.is_numeric_dtype(columnDf)
                     and not pd.api.types.is_bool_dtype(columnDf))
        if isNumeric:
            columnDf.hist()
        else:
            columnDf.value_counts().plot.bar()
        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{columnNames[i]} (column {i})')

    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()
    
    
# Plot at most 10 column distributions (nGraphShown), laid out 5 per row (nGraphPerRow).
plotPerColumnDistribution(df, 10, 5)

### Understanding Correlations

In [None]:
# Pairwise correlations of the numeric (and boolean) columns only. Text columns
# cannot be correlated, and calling corr() on the full frame raises on recent
# pandas instead of silently dropping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()

fig = plt.figure(num=None, figsize=(6, 6), dpi=80, facecolor='w', edgecolor='k')
# Draw into the figure created above instead of assuming it is figure number 1.
corrMat = plt.matshow(corr, fignum=fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()  # matshow puts x ticks on top by default
plt.colorbar(corrMat)
# NOTE(review): the title mentions "Transactions", which does not match this
# HR dataset; wording left unchanged, confirm the intended title.
plt.title('Correlation Matrix for all Transactions', fontsize=15)
plt.savefig("./correlation.png")
plt.show()

### My current gear:

<table style="width:100%">
  <tr>
    <th>HP Z8 G4 Tower - 1125W PSU</th>
    <th>HP ZBook Studio - G7 Mobile Workstation</th>
  </tr>
  <tr>
    <td>6234 3.3 GHz (8 Core each) i9 Processors x 2</td>
    <td>6234 3.3 GHz (8 Core) i9 Processor x 1</td>
  </tr>
  <tr>
    <td>NVIDIA Quadro RTX 8000 x 1</td>
    <td>NVIDIA Quadro RTX 5000 x 1</td>
  </tr>
  <tr>
    <td>96GB DDR4 RAM 2933</td>
    <td>32GB DDR4 RAM 2933</td>
  </tr>
    
  <tr>
    <td>2 TB NVMe M.2 SSD</td>
    <td>2 TB NVMe M.2 SSD</td>
  </tr>
  <tr>
    <td><img src= "https://ssl-product-images.www8-hp.com/digmedialib/prodimg/lowres/c05724976.png?imdensity=1&imwidth=1024" width=200px></td>
    <td><img src="https://www8.hp.com/content/dam/sites/worldwide/personal-computers/commercial/workstations/zbook-studio/images/color-accuracy-image-desktop.png" width=200px></td>
  </tr>
</table>


### Thanks :)