In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the Data

In [None]:
# Read the job-skills CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/google-job-skills/job_skills.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame before any cleaning.
df.shape

In [None]:
# Summary statistics, transposed so that each original column becomes a row.
df.describe().T

In [None]:
# Show the raw "Minimum Qualifications" text of the row with index label 1.
df["Minimum Qualifications"][1]

## Data Cleaning

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### Removing Missing Values

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# (rows, columns) after rows with missing values were dropped.
df.shape

#### Tokenize

In [None]:
from nltk.tokenize import word_tokenize 

# Replace each free-text column with the list of word tokens NLTK extracts from it.
for text_column in ('Responsibilities', 'Minimum Qualifications', 'Preferred Qualifications'):
    df[text_column] = df[text_column].apply(word_tokenize)

#### Removing Stopwords

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# The NLTK English stop-word list is all lowercase, while the tokens still carry
# their original casing. Compare case-insensitively so capitalised stop words
# ("The", "In", "And", ...) are removed as well.
# NOTE(review): this also drops upper-case tokens that spell a stop word
# (e.g. "IT"); none of the terms counted later in this notebook are affected.
for column in ('Responsibilities', 'Minimum Qualifications', 'Preferred Qualifications'):
    df[column] = df[column].apply(
        lambda tokens: [w for w in tokens if w.lower() not in stop_words]
    )

#### Join Tokens Back into Strings

In [None]:
# Turn each token list back into a single space-separated string.
# Each of the three tokenized columns is joined exactly once: joining an
# already-joined string would insert a space between every character.
# The isinstance guard keeps the cell safe to re-run.
for column in ('Responsibilities', 'Minimum Qualifications', 'Preferred Qualifications'):
    df[column] = df[column].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

## Which qualifications are requested by Google?

### 1- Languages

The most popular language is Python.

In [None]:
# Lower-case language / technology names to look for in the qualification text.
programing_language_list = [
    'go', 'r', 'sas', 'matlab', 'stata', 'python',
    'java', 'net', 'c++', 'html', 'css', 'php',
    'javascript', 'objective-c', 'ruby', 'perl', 'c', 'c#',
    'sql', 'mysql', 'mapreduce', 'hadoop', 'kotlin',
]

In [None]:
# Build one lower-cased corpus string from every posting's minimum qualifications.
min_qualifications = df['Minimum Qualifications'].tolist()
# Join with a space: an empty separator would glue the last token of one
# posting to the first token of the next (e.g. "...python 3" + "5 years"
# becoming "...python 35 years"), corrupting the word and number matches
# made on this string later.
min_qualifications_string = ' '.join(map(str, min_qualifications)).lower()

In [None]:
import re
# Start every known language at zero, then tally each token of the
# qualifications text that is exactly one of those languages.
languages_count = dict.fromkeys(programing_language_list, 0)
language_tokens = re.findall(r"[\w'+#-]+|[.!?;’]", min_qualifications_string)
for token in language_tokens:
    if token not in languages_count:
        continue
    languages_count[token] += 1

print(languages_count)

In [None]:
# Two-column frame: one row per language with the number of times it was seen.
popular_languages = pd.DataFrame(
    {
        'Language': list(languages_count.keys()),
        'Popularity': list(languages_count.values()),
    }
)

In [None]:
# Preview the first rows of the language-count table.
popular_languages.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how often each language appears in the minimum qualifications.
fig, ax = plt.subplots(figsize=(15, 4))
sns.barplot(data=popular_languages, x="Language", y="Popularity", ax=ax)

In [None]:
# Categorical plot of language popularity.
# sns.catplot is a figure-level function: it creates its own figure, sized by
# `height` and `aspect`. A preceding plt.figure(...) call would therefore only
# produce an extra empty figure, so none is made here.
sns.catplot(x="Language", y="Popularity", data=popular_languages, height=8.27, aspect=11.7 / 8.27)

### 2- Location

#### Country

In [None]:
# Preview the raw Location strings before splitting them.
df["Location"].head()

In [None]:
# The country is the last comma-separated component of Location.
# Strip the whitespace that follows the comma so labels are clean and a value
# with a leading space is not counted separately from the same name without one.
df["Country"] = df["Location"].str.split(',').str[-1].str.strip()

In [None]:
# Display the derived Country column.
df["Country"]

In [None]:
# Postings per country, keeping only countries with at least 10 postings.
country_value_counts = (
    df["Country"]
    .value_counts()
    .loc[lambda counts: counts >= 10]
)

In [None]:
# Turn the filtered counts into a two-column frame (Country, Preference).
df_popular_countries = (
    country_value_counts
    .rename_axis('Country')
    .reset_index(name='Preference')
)
df_popular_countries.head()

In [None]:
# Bar chart of posting counts for the retained countries.
fig, ax = plt.subplots(figsize=(15, 4))
sns.barplot(data=df_popular_countries, x="Country", y="Preference", ax=ax)

#### City

In [None]:
# The city is whatever precedes the first comma of the Location string.
location_parts = df["Location"].str.split(',', n=1)
df["City"] = location_parts.str[0]
df["City"].head()

In [None]:
# Postings per city, keeping only cities with at least 10 postings.
city_value_counts = (
    df["City"]
    .value_counts()
    .loc[lambda counts: counts >= 10]
)

In [None]:
# Turn the filtered counts into a two-column frame (City, Preference).
df_popular_cities = (
    city_value_counts
    .rename_axis('City')
    .reset_index(name='Preference')
)
df_popular_cities.head()

In [None]:
# Bar chart of posting counts for the retained cities.
fig, ax = plt.subplots(figsize=(30, 8))
sns.barplot(data=df_popular_cities, x="City", y="Preference", ax=ax)

## 3- Degree

In [None]:
# Lower-case degree keywords to look for in the qualification text.
degree_list = [
    "ba",
    "bs",
    "bachelor's",
    "phd",
    'mba',
    'bachelor',
    'student',
]


In [None]:
# Start every degree keyword at zero, then tally each token of the
# qualifications text that is exactly one of those keywords.
degree_count = {degree: 0 for degree in degree_list}
degree_tokens = re.findall(r"[\w']+|[.,!?;’]", min_qualifications_string)
for token in degree_tokens:
    if token not in degree_count:
        continue
    degree_count[token] += 1

print(degree_count)

In [None]:
# Build a (Degree, Popularity) table ordered from most to least frequent,
# with the degree keywords shown in upper case.
degree_ranked = pd.DataFrame.from_dict(degree_count, orient='index')
degree_ranked = degree_ranked.sort_values(by=0, ascending=False)
df_degree_popular = degree_ranked.reset_index()
df_degree_popular.columns = ['Degree', 'Popularity']
df_degree_popular['Degree'] = df_degree_popular['Degree'].str.upper()
df_degree_popular

In [None]:
# Bar chart of how often each degree keyword appears.
fig, ax = plt.subplots(figsize=(30, 8))
sns.barplot(data=df_degree_popular, x="Degree", y="Popularity", ax=ax)

## 4 - Years of Experience

In [None]:
from collections import defaultdict

# Tally how often each "<N> year..." requirement appears in the minimum
# qualifications text. Keys are the matched digit strings.
years_exp = defaultdict(int)
for years in re.findall(r'([0-9]+) year', min_qualifications_string):
    years_exp[years] += 1

# Print a plain dict so the output does not include the defaultdict's
# factory repr.
print(dict(years_exp))

# Build the (Years of Experience, Popularity) table directly from the sorted
# items. Unlike from_dict(...).sort_values(by=0), this also works when nothing
# matched: the result is then an empty frame with the two expected columns.
df_years_exp = pd.DataFrame(
    sorted(years_exp.items(), key=lambda item: item[1], reverse=True),
    columns=['Years of Experience', 'Popularity'],
)

df_years_exp.head()

In [None]:
# Bar chart of required years of experience; `order` keeps the bars in the
# same sequence as the rows of df_years_exp.
fig, ax = plt.subplots(figsize=(30, 8))
sns.barplot(
    data=df_years_exp,
    x="Years of Experience",
    y="Popularity",
    order=df_years_exp['Years of Experience'],
    ax=ax,
)

## 5 - Popular Category

In [None]:
# Count postings per job category and expose them as a
# two-column frame (Category, Preference).
category_value_counts = df["Category"].value_counts()

df_popular_category = (
    category_value_counts
    .rename_axis('Category')
    .reset_index(name='Preference')
)
df_popular_category.head()

In [None]:
# Horizontal bar chart: one bar per job category, length = number of postings.
fig, ax = plt.subplots(figsize=(30, 18))
sns.barplot(data=df_popular_category, x="Preference", y="Category", ax=ax)