In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import important libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pakistan Intellectual Capital Task

1- Which area of interest/expertise is in abundance in Pakistan, and where do we need more people?

2- How many professors do we have in Data Science, Artificial Intelligence, or Machine Learning?

3- Which country and university hosted the majority of our teachers?

4- Which research areas were most common in Pakistan?

5- How does Pakistan's student-to-PhD-professor ratio compare against the rest of the world, especially the USA, India and China?

6- Any visualization and patterns you can generate from this data



In [None]:
# Read the raw CSV (latin1-encoded) and keep it untouched; all later work happens on a copy
DATA_PATH = '/kaggle/input/pakistanintellectualcapitalcs/Pakistan Intellectual Capital - Computer Science - Ver 1.csv'
original_data = pd.read_csv(DATA_PATH, encoding="latin1")
df = original_data.copy()
df.head()

In [None]:
# Map the original CSV headers to short snake_case column names
column_renames = {
    'Teacher Name': 'teacher_name',
    'University Currently Teaching': 'current_university',
    'Department': 'department',
    'Province University Located': 'province',
    'Designation': 'designation',
    'Terminal Degree': 'degree',
    'Graduated from': 'graduated_from',
    'Country': 'country',
    'Year': 'year',
    'Area of Specialization/Research Interests': 'specialization',
    'Other Information': 'other_information',
}
df = df.rename(columns=column_renames)
df.head()


In [None]:
# Basic information about the data: column names, dtypes and non-null counts per column
df.info()

In [None]:
# Report how many rows and columns the frame has, then display the shape tuple
Row, Col = len(df.index), len(df.columns)
print(f'There are {Row} Rows and {Col} columns')
df.shape

In [None]:
# Remove 'S#' (not used in the analysis) and 'other_information'
# (noted by the original author as containing only null values).
df = df.drop(columns=['S#', 'other_information'])
# Count the missing values remaining in each column
df.isna().sum()

**Task 1**

Which area of interest/expertise is in abundance in Pakistan, and where do we need more people?

In [None]:
# Work on a one-column frame holding only the specialization text
specializations = df['specialization'].to_frame()
# Number of null values in the 'specialization' column
specializations['specialization'].isna().sum()

In [None]:
# Drop the rows whose specialization is missing; this modifies `specializations` in place
specializations.dropna(inplace=True)

In [None]:
# Normalise the free-text specialization strings, then split them into lists of areas.
spec_text = specializations['specialization'].str.lower()

# Remove periods. regex=False makes '.' a literal dot on every pandas version;
# as a regular expression '.' would match any character.
spec_text = spec_text.str.replace('.', '', regex=False)

# Remove the conjunction "and" only when it is a whole word. A plain substring
# replacement would also corrupt words that merely contain "and"
# (e.g. "handling" -> "hling", "android" -> "roid").
spec_text = spec_text.str.replace(r'\band\b', '', regex=True)

# Most records list several areas of interest separated by commas,
# so split each record on ',' into a list of areas.
specializations['specialization'] = spec_text.str.split(',')


In [None]:
# Flatten the per-record lists into one list with a single area of interest per entry.
# Each token is stripped of surrounding whitespace; tokens that are empty after
# stripping (e.g. from a trailing or doubled comma) are skipped so that the empty
# string is never counted as an area of interest.
area_list = [
    area.strip()
    for areas in specializations['specialization']
    for area in areas
    if area.strip()
]
df_area_interest = pd.DataFrame(area_list)


In [None]:
# Name the single column 'area_of_interest' (it is created as column 0),
# then count how often each area occurs.
df_area_interest = df_area_interest.rename({0: 'area_of_interest'}, axis='columns')
area_counts = df_area_interest['area_of_interest'].value_counts()
task1_area_interest = area_counts.to_frame()

In [None]:
# Move the area labels out of the index so the table has two regular columns:
# the area label and how often it occurs.
area_of_interest = task1_area_interest.reset_index()
# Assign the column names by position rather than renaming by old name.
# The names produced by value_counts()/reset_index() differ between pandas versions
# ('index' + 'area_of_interest' before 2.0, 'area_of_interest' + 'count' from 2.0 on);
# a name-based rename yields two columns called 'count' on pandas >= 2.0.
area_of_interest.columns = ['area_of_interest', 'count']

In [None]:
# Bar chart of the ten most frequent areas of interest
top_10_areas = area_of_interest.head(10)

plt.figure(figsize=(15, 8))
sns.barplot(x=top_10_areas['area_of_interest'], y=top_10_areas['count'])
plt.title('Top Area of Interest', fontsize=20)
plt.xlabel('Area of Interests', fontsize=15)
plt.ylabel('Counts', fontsize=15)
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.show()

**Task - 2**

How many professors do we have in Data Science, Artificial Intelligence, or Machine Learning?

In [None]:
# Count how often each field is listed as an area of interest.
# All three counts are taken from df_area_interest, which has one row per listed
# area. (Counting over the value-counts table `area_of_interest` instead would
# see each label only once, so the result could never exceed 1.)
# NOTE: these are exact-label mentions; a record listing both 'ai' and
# 'artificial intelligence' contributes two to the Artificial Intelligence count.
interest_mentions = df_area_interest['area_of_interest']

data_science = int((interest_mentions == 'data science').sum())
print("Data Science:", data_science)

machine_learning = int((interest_mentions == 'machine learning').sum())
print('Machine Learning:' , machine_learning)

# 'ai' is treated as an abbreviation of 'artificial intelligence'
artificial_intelligence = int(interest_mentions.isin(['artificial intelligence', 'ai']).sum())
print('Artificial Intelligence:' , artificial_intelligence)

**Task 3**

Which country hosted the majority of our teachers?

In [None]:
# Work on a one-column frame holding only the country values
countries = df['country'].to_frame()

In [None]:
# Trim surrounding whitespace, then relabel "Urbana" as "USA" and "Macau" as "China"
country_fixes = {"Urbana": "USA", "Macau": "China"}
countries['country'] = countries['country'].str.strip()
for wrong_label, right_label in country_fixes.items():
    countries.loc[countries['country'] == wrong_label, "country"] = right_label


In [None]:
# Drop the rows with a missing country; this modifies `countries` in place
countries.dropna(inplace=True)

In [None]:
# Count how many records fall under each country
country_counts = countries['country'].value_counts()
task_country = country_counts.to_frame()

In [None]:
# Move the country labels out of the index into a regular column
task_country = task_country.reset_index()

In [None]:
# Name the two columns by position: country label first, number of records second.
# The names left by value_counts()/reset_index() differ between pandas versions
# ('index' + 'country' before 2.0, 'country' + 'count' from 2.0 on), and a
# name-based rename produces two columns called 'counts' on pandas >= 2.0.
# set_axis raises if the table does not have exactly two columns.
task_country = task_country.set_axis(['country', 'counts'], axis=1)

In [None]:
# Bar plot of the 15 most frequent countries.
# head(15) selects exactly 15 rows; the label slice .loc[0:15] includes both
# endpoints and would plot 16.
top_countries = task_country.head(15)

plt.figure(figsize=(15,8))
sns.barplot(x=top_countries['country'], y=top_countries['counts'])
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('Country', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Which Country Hosted Most Pakistani Phd Holders', fontsize=20)
plt.show()

Which university hosted the majority of our teachers?

In [None]:
# Work on a one-column frame holding only the 'graduated_from' values
universities = df['graduated_from'].to_frame()


In [None]:
# Drop the rows with a missing 'graduated_from' value; this modifies `universities` in place
universities.dropna(inplace=True)

In [None]:
# Count how many records name each university in 'graduated_from'
university_counts = universities['graduated_from'].value_counts()
task_university = university_counts.to_frame()


In [None]:
# Reset the index so the university names become a regular column
task_university = task_university.reset_index()


In [None]:
# Name the two columns by position: university name first, number of records second.
# The names left by value_counts()/reset_index() differ between pandas versions
# ('index' + 'graduated_from' before 2.0, 'graduated_from' + 'count' from 2.0 on),
# and a name-based rename produces two columns called 'count' on pandas >= 2.0.
# set_axis raises if the table does not have exactly two columns.
task_university = task_university.set_axis(['graduated_from', 'count'], axis=1)


In [None]:
# Bar plot of the 15 universities named most often in 'graduated_from'.
# head(15) selects exactly 15 rows, matching the "Top 15" title; the label slice
# .loc[0:15] includes both endpoints and would plot 16.
top_universities = task_university.head(15)

plt.figure(figsize=(15, 8))
sns.barplot(x=top_universities['graduated_from'], y=top_universities['count'])
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('University', fontsize=15)
plt.ylabel('Number of Professors', fontsize=15)
plt.title('Top 15 Universities which hosted most of the pakistani professors', fontsize=20)
plt.show()


**Task 4**

4- Which research areas were most common in Pakistan?

In [None]:
# Bar chart of the fifteen most frequent research areas
top_15_areas = area_of_interest.iloc[:15]

plt.figure(figsize=(15, 8))
sns.barplot(x=top_15_areas['area_of_interest'], y=top_15_areas['count'])
plt.title('Top Area of Interest', fontsize=20)
plt.xlabel('Area of Interests', fontsize=15)
plt.ylabel('Counts', fontsize=15)
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.show()

**In this plot we can see that 'Software Engineering' and 'Machine Learning' are the top areas of interest.**

**Task 5**

How does Pakistan's student-to-PhD-professor ratio compare against the rest of the world, especially the USA, India and China?

In [None]:
# Take an independent copy of the degree column.
# pd.DataFrame(series) does not copy its input by default, so in-place .loc
# assignments on the result may write through to `df` (depending on the pandas
# version and copy-on-write mode). An explicit copy keeps `df` unchanged.
phd = df[['degree']].copy()


In [None]:
# Collapse the differently spelled PhD labels into the single label "Phd"
phd_spellings = ['PhD', 'Ph.D(Scholar)', 'Ph.D (Scholar)', 'Ph.D']
phd.loc[phd['degree'].isin(phd_spellings), 'degree'] = 'Phd'

In [None]:
# Show the five most frequent degree labels and their counts
phd.degree.value_counts().head()