![Engineers](https://propakistani.pk/wp-content/uploads/2020/05/Cambridge.jpg)

# **1. Importing required libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
from wordcloud import WordCloud

sns.set_style("darkgrid")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **2. Reading Data**

In [None]:
engineer_data = pd.read_csv("/kaggle/input/pakistan-engineers-data/PITC_Engineering.csv")

In [None]:
engineer_data.head()

# **3. Dropping Unnecessary columns**


In [None]:
# Remove, in place, the four columns that are not used in the analysis.
engineer_data.drop(
    columns=["Unnamed: 0", "ID", "PEC_No", "License_no"],
    inplace=True,
)

In [None]:
engineer_data.head()

# **4. Meta information of dataframe**

## **4.1- Shape of dataframe**

In [None]:
print(f"Shape of dataframe is: {engineer_data.shape}")

## **4.2- Datatype of features**

In [None]:
engineer_data.dtypes

In [None]:
engineer_data.info()

# **5. Data Cleaning**

## **5.1- Re-naming *Employment_period***

In [None]:
# Put the unit into the column name; the rename is applied in place.
engineer_data.rename(
    columns={"Employment_period": "Employment_period(days)"},
    inplace=True,
)

## **5.2- Converting datatype of *Employment_Start_Date* & *Employment_End_Date* into datetime**

In [None]:
# Convert both employment date columns to pandas datetime values.
for date_column in ('Employment_Start_Date', 'Employment_End_Date'):
    engineer_data[date_column] = pd.to_datetime(engineer_data[date_column])

## **5.3- Converting datatype of Employment_period(days) feature**

In [None]:
# Strip the literal "days" suffix and store the period as a nullable integer.
# The builtin `float` replaces `np.float`, which was removed in NumPy 1.24 and
# now raises AttributeError. The intermediate float cast lets missing entries
# pass through as NaN before the final conversion to the nullable "Int64" dtype.
engineer_data['Employment_period(days)'] = (
    engineer_data['Employment_period(days)']
    .str.replace('days', '', regex=False)
    .astype(float)
    .astype("Int64")
)

## **5.4- Checking for NaN values**

In [None]:
# Draw the chart first and title the axes it returns. Depending on the
# missingno version, msno.bar may open its own figure, in which case a title
# set beforehand would land on a separate, empty figure.
ax = msno.bar(engineer_data, fontsize = 20, color="dodgerblue", sort="descending")
ax.set_title("Numbers of missing values in Each Column\n\n", size = 30);

In [None]:
# Heatmap of the boolean null mask, one column per feature; row labels and
# the colour bar are switched off.
plt.figure(figsize=(10, 5))
sns.heatmap(engineer_data.isnull(), cbar=False, yticklabels=False)
plt.title("Missing Values in Dataframe\n", size=15);

In [None]:
# Print, for every column, the percentage of rows that have no value.
total_rows = engineer_data.shape[0]
for column_name in engineer_data.columns:
    null_count = engineer_data[column_name].isnull().sum()
    missing_percent = np.round((null_count * 100) / total_rows, 3)
    print(f'{missing_percent} % data is missing in `{column_name}` column')

In [None]:
def msno_table(dataframe):
    """Summarise the missing values of ``dataframe`` column by column.

    Prints the total number of columns and how many of them contain missing
    values, then returns the per-column details.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column whose missing percentage is non-zero, with the
        columns 'Missing Values' (null count) and '% of Total Values'
        (null count as a percentage of the row count), sorted by percentage
        in descending order and rounded to 3 decimals.
    """
    # Count the nulls once and derive the percentage from that count.
    null_counts = dataframe.isnull().sum()
    null_percent = (null_counts * 100) / len(dataframe)

    # Put count and percentage side by side and label the two columns.
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary.columns = ['Missing Values', '% of Total Values']

    # Keep only columns that actually have missing data, worst first.
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(3)

    print(f'Selected dataframe has total {dataframe.shape[1]} columns \n There are {summary.shape[0]} columns that contain missing values\n')

    return summary

In [None]:
msno_table(engineer_data)

## **5.5- Checking minimum and maximum Employment_period(days)**

In [None]:
engineer_data['Employment_period(days)'].max()

In [None]:
engineer_data['Employment_period(days)'].min()

**The minimum value of Employment_period(days) is *-51*, which does not make sense, so this issue is fixed below.**

## **5.6- Checking for total records that contain negative value in Employment_period(days)**

In [None]:
# Collect every row whose recorded employment period is below zero days.
is_negative_period = engineer_data['Employment_period(days)'].lt(0)
negative_Employment_period = engineer_data.loc[is_negative_period]

negative_Employment_period

In [None]:
negative_Employment_period['Employment_End_Date'] - negative_Employment_period['Employment_Start_Date']

**A total of 4 records in the dataset have a negative Employment_period(days) value. Looking at Employment_Start_Date and Employment_End_Date for these records, the Employment_End_Date is before the Employment_Start_Date, which is the root cause of the negative Employment_period(days).**

In [None]:
# Plot the start and end dates of the negative-period rows against their
# row index, one line per date column.
plt.figure(figsize=(10, 7))
for date_column, series_label, line_color in (
    ('Employment_Start_Date', "Start Date", 'b'),
    ('Employment_End_Date', "End Date", 'r'),
):
    plt.plot(negative_Employment_period[date_column],
             label=series_label, marker='o', color=line_color)

plt.legend();


**Removing these 4 records from data**

In [None]:
# Remove the rows held in `negative_Employment_period` and renumber the index
# from 0. `drop=True` discards the old index directly instead of inserting it
# as a column that then has to be removed by its (assumed) name 'index'.
engineer_data = (
    engineer_data
    .drop(negative_Employment_period.index)
    .reset_index(drop=True)
)

In [None]:
engineer_data[engineer_data['Employment_period(days)'] < 0]

**All records that have a negative value in `Employment_period(days)`, i.e. where `Employment_Start_Date` is after `Employment_End_Date`, have been removed.**

## **5.7- Imputing missing values**

In [None]:
# Every imputed column is assigned back to the frame. Calling
# `fillna(..., inplace=True)` / `interpolate(..., inplace=True)` on a column
# selection is chained assignment: it emits a FutureWarning on pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.

# Names cannot be inferred from other rows, so use an explicit placeholder.
for name_column in ('Engineer_Name', 'Father_Name'):
    engineer_data[name_column] = engineer_data[name_column].fillna("Unknown")

# Categorical columns: fill with the most frequent value of the column.
for categorical_column in ('Company_Name', 'Engineering_Degree',
                           'Firm_Category', 'Firms_City', 'Province'):
    most_frequent = engineer_data[categorical_column].mode()[0]
    engineer_data[categorical_column] = (
        engineer_data[categorical_column].fillna(most_frequent)
    )

# Employment period: fill with the median. The column is a nullable integer,
# so a fractional median (possible with an even number of values) is rounded
# to a whole number of days before it is stored.
median_period = engineer_data['Employment_period(days)'].median()
engineer_data['Employment_period(days)'] = (
    engineer_data['Employment_period(days)'].fillna(int(round(median_period)))
)

# Coordinates: linear interpolation between neighbouring rows.
# NOTE(review): this depends on row order, and with the default settings
# leading gaps at the start of a column may stay unfilled - confirm this is
# acceptable for the maps/plots that use these columns.
for coordinate_column in ("Latitude", "Longitude"):
    engineer_data[coordinate_column] = (
        engineer_data[coordinate_column].interpolate(method='linear')
    )

### **All records where Employment_Start_Date or Employment_End_Date is NaT**

In [None]:
engineer_data[(engineer_data['Employment_Start_Date'].isnull()) | (engineer_data['Employment_End_Date'].isnull())][['Employment_Start_Date','Employment_End_Date','Employment_period(days)','Engineering_Degree']].head(20)

**Filling the NaT values in Employment_Start_Date and Employment_End_Date according to Employment_period(days), using the dates of existing records whose Employment_period(days) is 330**

In [None]:
engineer_data[engineer_data['Employment_period(days)'] == 330]

**We can see that the Employment_Start_Date of every record where Employment_period(days) is 330 is `2019-08-05`, and the Employment_End_Date of each of these records is `2020-06-30`. So the missing values are filled with these dates.**

In [None]:
# Fill missing start/end dates with the dates shared by the records whose
# employment period is 330 days. The result is assigned back (in-place fillna
# on a column selection is chained assignment and stops working under
# Copy-on-Write), and Timestamp objects are used so the filled values are
# datetimes rather than strings.
engineer_data['Employment_Start_Date'] = (
    engineer_data['Employment_Start_Date'].fillna(pd.Timestamp('2019-08-05'))
)

engineer_data['Employment_End_Date'] = (
    engineer_data['Employment_End_Date'].fillna(pd.Timestamp('2020-06-30'))
)

In [None]:
# Re-draw the null-mask heatmap to check the frame after imputation.
plt.figure(figsize=(10, 5))

sns.heatmap(engineer_data.isna(), cbar=False, yticklabels=False);

**All the missing values have been imputed**

In [None]:
# Make sure both date columns hold datetime values after the fill step.
for date_column in ('Employment_Start_Date', 'Employment_End_Date'):
    engineer_data[date_column] = pd.to_datetime(engineer_data[date_column])

In [None]:
engineer_data.dtypes

In [None]:
engineer_data.head()

# **6- Visualization**

## **6.1- Analyzing Engineering_Degree**

In [None]:
# Record counts per engineering degree, one colour per degree.
px.histogram(
    engineer_data,
    x='Engineering_Degree',
    color="Engineering_Degree",
    title="<b>Counts of Engineering Degree</b>",
)

In [None]:
# Donut chart (hole=0.5) of each engineering degree's share of the records.
fig = px.pie(
    data_frame=engineer_data,
    names='Engineering_Degree',
    hole=0.5,
    width=800,
    height=600,
)

fig.show()

**CIVIL Engineering has the highest count, with 5987 records**

## **6.2- Analyzing Firm_Category**

In [None]:
# Record counts per firm category, with the bars split by the firm's city.
px.histogram(
    data_frame=engineer_data,
    x='Firm_Category',
    color="Firms_City",
)

**CIVIL Engineering has the highest count, and Islamabad is the city where most of the firms are located**

## **6.3- Analyzing Employment period**

In [None]:
# Ten rows with the longest employment period; preview the first five.
desending = (
    engineer_data
    .sort_values(by='Employment_period(days)', ascending=False)
    .iloc[:10]
)
desending.iloc[:5]

In [None]:
# Ten rows with the shortest employment period; preview the first five.
ascending = (
    engineer_data
    .sort_values(by='Employment_period(days)', ascending=True)
    .iloc[:10]
)
ascending.iloc[:5]

### **6.3.1- Analyzing Companies where Employee stay for long period**

In [None]:
# Employment period per engineer for the longest-period rows, coloured by
# company; city and degree are shown on hover.
px.scatter(
    desending,
    x='Engineer_Name',
    y='Employment_period(days)',
    color='Company_Name',
    hover_data=['Firms_City', 'Engineering_Degree'],
)

### **6.3.2- Analyzing Companies where Employee doesn't stay for long period**

In [None]:
# Employment period per engineer for the shortest-period rows, coloured by
# company; city and degree are shown on hover.
px.scatter(
    ascending,
    x='Engineer_Name',
    y='Employment_period(days)',
    color='Company_Name',
    hover_data=['Firms_City', 'Engineering_Degree'],
)

## **6.4- Analyzing Engineering_Degree and Firm City**

In [None]:
# Treemap nested as degree -> city, coloured by province.
fig = px.treemap(
    data_frame=engineer_data,
    path=['Engineering_Degree', 'Firms_City'],
    color='Province',
    hover_data=['Province'],
)
fig.show()

## **6.5- Province where most firms are located**

In [None]:
# Sunburst nested as province -> city; the province is repeated on hover.
fig = px.sunburst(
    data_frame=engineer_data,
    path=['Province', 'Firms_City'],
    hover_data=['Province'],
)


fig.show()

## **6.6- Cities of Pakistan where most Firms are located**

In [None]:
# Treemap of firm cities under a single "Pakistan" root, one colour per city.
fig = px.treemap(
    data_frame=engineer_data,
    path=[px.Constant("Pakistan"), 'Firms_City'],
    color='Firms_City',
)
fig.show()

**From the above graph, most of the firms are located in Islamabad, then Lahore, then Karachi, and so on...**

## **6.7- Top Words in Engineering Degree**

In [None]:
# Join all lower-cased degree names into one string and render a word cloud.
degree_words = engineer_data['Engineering_Degree'].str.lower()
text = ' '.join(degree_words)

wordcloud = WordCloud(
    width=2000,
    height=900,
    background_color='black',
    collocations=False,
    max_words=5000,
    min_font_size=20,
)
wordcloud = wordcloud.generate(text)

# Show the cloud on a black canvas without axes or padding.
plt.figure(figsize=(12, 8), facecolor='k', edgecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

## **6.8- Top Words in Company Name**

In [None]:
# Join all lower-cased company names into one string and render a word cloud.
company_words = engineer_data['Company_Name'].str.lower()
text = ' '.join(company_words)

wordcloud = WordCloud(
    width=2000,
    height=1000,
    background_color='black',
    collocations=False,
    max_words=5000,
    min_font_size=20,
)
wordcloud = wordcloud.generate(text)

# Show the cloud on a black canvas without axes or padding.
plt.figure(figsize=(12, 8), facecolor='k', edgecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

<h2 style="text-align:center;background-color:#6cdcf5">I Hope you like this kernel. If so, don't forget to upvote and leave your valuable comment. Thank you :)</h2>
