#  Data Analysis


#  **IMPORTING LIBRARIES**



In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


# About This Dataset

Amid the pandemic many people lost their jobs. This dataset can help sharpen the job search so that more people in need can find employment. It was created by picklesueat and contains more than 2,000 job listings for data analyst positions, with features such as:

1. Salary Estimate
2. Location
3. Company Rating
4. Job Description and more.


# Data Wrangling

In [None]:
dataset = pd.read_csv("../input/data-analyst-jobs/DataAnalyst.csv", index_col = 0)

In [None]:
dataset.head(5)

In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
dataset.describe()

In [None]:
dataset.describe(include = "O")

In [None]:
dataset.columns = dataset.columns.str.replace(" ", "_")


In [None]:
dataset.head(2)

In [None]:
dataset.Job_Title.value_counts().head()

In [None]:
dataset.Job_Title.value_counts()

In [None]:
# Collapse spelling / abbreviation variants of the same job title into one canonical form.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on `dataset.Job_Title`: that is chained in-place mutation, which newer pandas versions
# warn about and which leaves `dataset` unchanged under copy-on-write.
dataset["Job_Title"] = dataset["Job_Title"].replace({
    "Sr. Data Analyst": "Senior Data Analyst",
    "Sr Data Analyst": "Senior Data Analyst",
    "DATA ANALYST": "Data Analyst",
    "Data analyst": "Data Analyst",
    "Jr Data Analyst": "Junior Data Analyst",
})

In [None]:
dataset.Salary_Estimate.value_counts().head()

In [None]:
dataset.Company_Name.head()

In [None]:
split1 = dataset.Company_Name.str.split("\n",expand = True)
dataset["Company_Name"] = split1[0]
dataset.head()

In [None]:
split2 = dataset.Location.str.split("," , expand = True)
dataset["City"] = split2[0]
dataset["State"] = split2[1]

In [None]:
dataset.State.value_counts()

Arapahoe is a county in the state of Colorado (CO).

In [None]:
# Rows whose second comma-separated location part is the county name get the state code.
# na=False makes rows with a missing State count as non-matches; without it the mask
# contains NaN and .loc refuses to index with it.
dataset.loc[dataset.State.str.contains("Arapahoe", na=False), "State"] = "CO"

In [None]:
dataset.State = dataset.State.str.strip()

In [None]:
# Two-letter US state codes -> full state names, used to expand the State column.
# NC and SC are North/South Carolina (previously mislabelled as "... California").
abb = {
    "CA": "California",
    "TX": "Texas",
    "NY": "New York",
    "IL": "Illinois",
    "PA": "Pennsylvania",
    "AZ": "Arizona",
    "CO": "Colorado",
    "NC": "North Carolina",
    "NJ": "New Jersey",
    "WA": "Washington",
    "VA": "Virginia",
    "OH": "Ohio",
    "UT": "Utah",
    "FL": "Florida",
    "IN": "Indiana",
    "DE": "Delaware",
    "GA": "Georgia",
    "SC": "South Carolina",
    "KS": "Kansas",
}

In [None]:
dataset["State"] = dataset.State.map(abb)

In [None]:
dataset.State.value_counts()

In [None]:
dataset.Size.value_counts()

In [None]:
dataset.Revenue.value_counts()

In [None]:
dataset.Industry.value_counts().head()

In [None]:
dataset.Easy_Apply.value_counts()

In [None]:
dataset.Competitors.value_counts().head()

In [None]:
# Drop the Competitors column. errors="ignore" keeps the cell re-runnable: a second
# execution finds the column already gone and does nothing instead of raising KeyError.
dataset = dataset.drop(columns="Competitors", errors="ignore")
dataset.head()

In [None]:
# Number of rows that contain the numeric sentinel -1 in at least one column.
# `axis` is passed by keyword: pandas 2.0 made it keyword-only for DataFrame.any.
len(dataset.loc[(dataset == -1).any(axis=1)])

In [None]:
# Number of rows that contain the string sentinel "-1" in at least one column.
# `axis` is passed by keyword: pandas 2.0 made it keyword-only for DataFrame.any.
len(dataset.loc[(dataset == "-1").any(axis=1)])

In [None]:
dataset.replace(-1, np.nan , inplace = True)

In [None]:
dataset.replace("-1" , np.nan , inplace = True)

In [None]:
dataset.info()

In [None]:
def Missing_Values_dataframe(dataset):
    """Summarise missing values per column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataset`` with two columns, "Missing Values"
        (count of nulls) and "% of Missing Values" (share of rows that are null),
        sorted by the percentage in descending order and rounded to 2 decimals.
    """
    missing_count = dataset.isnull().sum(axis=0)
    missing_pct = 100 * missing_count / len(dataset)

    summary = pd.DataFrame({
        "Missing Values": missing_count,
        "% of Missing Values": missing_pct,
    })

    return summary.sort_values(by="% of Missing Values", ascending=False).round(2)

In [None]:
Missing_Values_dataframe(dataset)

In [None]:
# Turn Easy_Apply into a proper boolean column: True where the listing is flagged,
# False where the value is missing. Filling NaN with False alone leaves the existing
# values untouched, so a column that was read as text would end up mixing strings with bools.
# NOTE(review): assumes flagged rows hold True or the text "True" — confirm against the CSV.
dataset["Easy_Apply"] = dataset.Easy_Apply.astype(str).str.strip().str.lower().eq("true")

In [None]:
# Split the salary estimate on "-" into lower / upper parts and keep the first run of digits
# from each part as a number. The regex is a raw string so that `\d` is not treated as an
# (invalid) Python string escape.
sal_samp = dataset.Salary_Estimate.str.split("-", expand=True)
dataset["Min_salary_USD_k"] = pd.to_numeric(sal_samp[0].str.extract(r'(\d+)', expand=False))
dataset["Max_salary_USD_k"] = pd.to_numeric(sal_samp[1].str.extract(r'(\d+)', expand=False))

In [None]:
dataset.loc[: , ["Min_salary_USD_k","Max_salary_USD_k"]].head(5)

In [None]:
dataset["Avg_salary_USD_k"] = (dataset["Max_salary_USD_k"] + dataset["Min_salary_USD_k"]) / 2

A new column, Rating_Range, bins each company's rating into low, medium and high bands, which makes it easier to analyse highly rated companies.

In [None]:
dataset["Rating_Range"] = pd.cut(dataset.Rating , bins= [0 , 2.75 , 4.2 ,5] , 
                                 labels = ["Low Rated" , "Medium Rated" , "High Rated"])

# Visualising Data
Rating of companies located in respective states. Box plots show the five-number summary of a set of data.

In [None]:
sns.catplot(kind = "box", x = "Rating",y = "State", data = dataset, height=7, aspect = 1.5)
plt.title("Rating of Companies VS State")

In [None]:
# Minimum salary per listing by state, coloured by the company's rating band.
# `palette` (not `cmap`) is the seaborn argument that selects the colours of the `hue` levels.
sns.relplot(kind="scatter", y="State", x="Min_salary_USD_k", hue="Rating_Range",
            s=100, data=dataset, height=7, aspect=2, palette="viridis")
plt.title("Minimum Salary by Companies Respect to State")

Grouping data by state.

* Visualizing Avg , Min , Max Salaries with respect to companies located in different states.

In [None]:
# Per-state means of rating and salary figures, highest average salary first.
salary_and_rating_cols = ["Rating", "Min_salary_USD_k", "Max_salary_USD_k", "Avg_salary_USD_k"]
state_data = (
    dataset.groupby("State")[salary_and_rating_cols]
    .mean()
    .reset_index()
    .sort_values("Avg_salary_USD_k", ascending=False)
)

In [None]:
sns.catplot(kind = "bar" , x = "Rating" ,y = "State" , data = state_data , height = 7 , aspect = 1.5 , 
           palette = 'YlGn')
plt.xlabel("Avg_Rating")
plt.title("Avg Rating of Companies respect to States ")

In [None]:
sns.catplot(kind = "bar" , x = "Min_salary_USD_k" ,y = "State" , data = state_data , height = 7 , aspect = 1.5 
            , palette = 'ch:r= -0.3,l=0.95')
plt.title("Minimum Salary by State")

In [None]:
sns.catplot(kind = "bar" , x = "Max_salary_USD_k" ,y = "State" , data = state_data , height = 7 , aspect = 1.5 
            , palette = 'YlOrRd')
plt.title("Maximum Salary by State")

In [None]:
sns.catplot(kind= "bar" , x = "Avg_salary_USD_k" , y = "State" , data = state_data , height = 7 , aspect = 1.5 ,
           palette = 'PuRd')
plt.title("Average Salary by State")

In [None]:
dataset.info()

Forming a new column, Salary_Range, helps us easily analyse the distribution of salaries.

In [None]:
dataset["Salary_Range"] = pd.qcut(dataset.Avg_salary_USD_k , q = [0 , 0.4 , 0.80  , 1] ,
                                  labels = ["Low Salary" , "Medium Salary" , "High Salary"])

In [None]:
dataset.head()

Grouping data with Sector and Visualizing high paid sectors.

In [None]:
# Mean of the average salary per sector, best-paying sectors first.
sector_data = (
    dataset.groupby("Sector")
    .Avg_salary_USD_k.mean()
    .sort_values(ascending=False)
    .reset_index()
)
sector_data.head()

In [None]:
sns.catplot(kind = 'bar' , data = sector_data , y = "Sector" , x = "Avg_salary_USD_k", height = 7 , aspect = 1.5, 
           palette = "Purples")
plt.title("Avg Salary by Sector")

In [None]:
# Mean of the average salary for each (sector, salary band) pair, highest first.
# Salary_Range is categorical, so observed=True restricts the result to pairs that actually
# occur in the data instead of emitting NaN rows for every unseen combination (and avoids
# the pandas FutureWarning about the changing `observed` default).
sector_sal_data = (
    dataset.groupby(["Sector", "Salary_Range"], observed=True)
    .Avg_salary_USD_k.mean()
    .sort_values(ascending=False)
    .reset_index()
)
sector_sal_data.head(6)

In [None]:
sns.relplot(kind = 'scatter' , data = sector_sal_data , y = "Sector" , x = "Avg_salary_USD_k" , hue = 'Salary_Range'
            , height = 7 , aspect = 1.5  )

In [None]:
High_Demand_jobs = dataset.Job_Title.value_counts().head(25)
High_Demand_jobs = High_Demand_jobs.reset_index()

In [None]:
# Label the two columns produced by value_counts().reset_index(): the title, then its count.
# Assigning by position works whichever names reset_index produced ("index"/"Job_Title" on
# older pandas, "Job_Title"/"count" on pandas >= 2.0) and is safe to re-run.
High_Demand_jobs.columns = ["Job_Title", "No_of_Companies"]

In [None]:
sns.catplot(kind = 'bar' , x = "No_of_Companies" , y = "Job_Title" , data = High_Demand_jobs, height = 7 , aspect = 2)
plt.title("High_Demand_jobs")

In [None]:
from wordcloud import WordCloud

In [None]:
Job_Title=dataset['Job_Title'][~pd.isnull(dataset['Job_Title'])]
wordCloud = WordCloud(width=450,height= 300).generate(' '.join(Job_Title))
plt.figure(figsize=(15,10))
plt.axis('off')
plt.title(dataset['Job_Title'].name,fontsize=20)
plt.imshow(wordCloud)

Now let's Find out Best Jobs by

* salary
* company rating.
* step by step

In [None]:
high_paid_company_data = dataset.groupby("Company_Name").Avg_salary_USD_k.mean().sort_values(ascending = False).head(250)
high_paid_company_data.head()

In [None]:
comp_rata_data = dataset.loc[: , ["Company_Name" , "Rating_Range", "Job_Title","State"]]

In [None]:
#pd.options.display.max_rows = 350
High_Paid_jobs = comp_rata_data.merge(high_paid_company_data , how = "inner" , on = "Company_Name")
High_Paid_jobs = High_Paid_jobs.sort_values("Avg_salary_USD_k", ascending = False)
High_Paid_jobs.head(2)

In [None]:
High_Paid_jobs_in_high_rated_company = High_Paid_jobs.loc[High_Paid_jobs.Rating_Range == "High Rated"]
High_Paid_jobs_in_high_rated_company.head(2)

In [None]:
High_Paid_jobs_in_high_rated_company = High_Paid_jobs_in_high_rated_company.sort_values(by = "Avg_salary_USD_k" ,
                                                                                        ascending = False , ignore_index = True)
High_Paid_jobs_in_high_rated_company.head(2)

In [None]:
Top_30_High_Paid_jobs_in_high_rated_company = High_Paid_jobs_in_high_rated_company.nlargest(30 , "Avg_salary_USD_k")
Top_30_High_Paid_jobs_in_high_rated_company.head(2)

In [None]:
Top_23 = Top_30_High_Paid_jobs_in_high_rated_company.groupby("Job_Title").Avg_salary_USD_k.mean()
Top_23 = Top_23.reset_index()
Top_23 = Top_23.sort_values("Avg_salary_USD_k" , ascending = False , ignore_index = True)
Top_23

Visualizing Best 23 Jobs in High Rated Companies.

In [None]:
sns.catplot(kind = "bar" , x = "Avg_salary_USD_k"  ,y = "Job_Title" , data = Top_23 , height = 6 , aspect = 2 , 
           palette = "Reds")
plt.title("High Paid Jobs in High Rated Companies")