# IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# READING THE DATASET

In [None]:
# Load the Forbes 2019 "America's Top Colleges" CSV into a DataFrame.
dataset = pd.read_csv(
    filepath_or_buffer='../input/forbes-americas-top-colleges-2019/ForbesAmericasTopColleges2019.csv',
)

In [None]:
# Print a summary of the freshly loaded frame (columns, non-null counts, dtypes).
dataset.info()

In [None]:
# First five rows of the dataset.
dataset.head(5)

In [None]:
# Last five rows of the dataset.
dataset.tail(5)

In [None]:
# Number of distinct values in each column.
dataset.nunique(axis=0)

In [None]:
# Remove the columns that are not used anywhere in this analysis.
dataset = dataset.drop(
    columns=[
        'City', 'State',
        'SAT Lower', 'SAT Upper',
        'ACT Lower', 'ACT Upper',
        'Website',
    ],
)

In [None]:
# Print the frame summary again to inspect the remaining columns.
dataset.info()

In [None]:
# Count the missing entries in each column.
dataset.isna().sum()


In [None]:
# Replace missing values with the column mean.
# The slice 5:10 is positional, so it relies on the column order left after
# the earlier drop — presumably the numeric price/salary/rate columns; verify
# against dataset.info() if the schema changes.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
dataset.iloc[:, 5:10] = imputer.fit_transform(dataset.iloc[:, 5:10])

In [None]:
# Re-count missing entries per column after imputation.
dataset.isna().sum()


# NUMBER OF PRIVATE AND PUBLIC COLLEGES IN THE TOP 100

In [None]:
# Take the first 100 rows (assumes the CSV is ordered by Rank — TODO confirm)
# and summarise the public institutions among them.
top_100 = dataset.iloc[:100]
public_data = top_100.loc[top_100['Public/Private'] == 'Public']
public_data.info()

In [None]:
# Take the first 100 rows (assumes the CSV is ordered by Rank — TODO confirm)
# and summarise the private institutions among them.
# The subset is stored under a name that matches its contents; it was
# previously assigned to `public_data`, which hid the public subset.
top_100 = dataset.iloc[0:100, :]
private_data = top_100[top_100['Public/Private'] == 'Private']
private_data.info()

## CONCLUSION - OF THE TOP 100 UNIVERSITIES, ONLY 25 ARE PUBLIC AND 75 ARE PRIVATE

# HEATMAP


In [None]:
import seaborn as sns

# Plot the correlation heatmap.
# Only numeric columns are passed to corr(): the frame still holds text
# columns (e.g. 'Public/Private'), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
ax = sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

# RANK VS ACCEPTANCE RATE

In [None]:
# Split the colleges by ownership type, then overlay both groups:
# red = public, blue = private.
public_data = dataset.loc[dataset['Public/Private'] == 'Public']
private_data = dataset.loc[dataset['Public/Private'] == 'Private']

plt.scatter(public_data['Rank'], public_data['Acceptance Rate'], color='red')
plt.scatter(private_data['Rank'], private_data['Acceptance Rate'], color='blue')
plt.xlabel('Rank')
plt.ylabel('Acceptance Rate')
# Author's reading of the plot: better-ranked colleges have a lower acceptance rate.

# RANK VS TOTAL ANNUAL COST

In [None]:
# Split the colleges by ownership type, then overlay both groups:
# red = public, blue = private.
public_data = dataset.loc[dataset['Public/Private'] == 'Public']
private_data = dataset.loc[dataset['Public/Private'] == 'Private']

plt.scatter(public_data['Rank'], public_data['Total Annual Cost'], color='red')
plt.scatter(private_data['Rank'], private_data['Total Annual Cost'], color='blue')
plt.xlabel('Rank')
plt.ylabel('Total Annual Cost')

# Author's reading of the plot: at the same rank, public colleges cost less,
# and better-ranked colleges generally cost more.

In [None]:
# 2x2 covariance matrix of Rank and Total Annual Cost; the off-diagonal
# entries are the covariance between the two columns.
covariance = np.cov(dataset['Rank'], y=dataset['Total Annual Cost'])
print(covariance)
# Author's observation: the covariance is negative.

# RANK VS STUDENT POPULATION

In [None]:
# Overlay public (red) and private (blue) colleges; relies on `public_data`
# and `private_data` built in an earlier cell.
plt.scatter(public_data['Rank'], public_data['Student Population'], color='red')
plt.scatter(private_data['Rank'], private_data['Student Population'], color='blue')
plt.xlabel('Rank')
# Axis label spelling fixed (was 'Student Poulation').
plt.ylabel('Student Population')
# Author's reading of the plot: student population is higher in better-ranked
# colleges; at the same rank, public colleges have a much higher population.

In [None]:
# 2x2 covariance matrix of Rank and Student Population; the off-diagonal
# entries are the covariance between the two columns.
covariance = np.cov(dataset['Rank'], y=dataset['Student Population'])
print(covariance)
# Author's observation: the covariance is negative.

# RANK VS ALUMNI SALARY

In [None]:
# Overlay public (red) and private (blue) colleges; relies on `public_data`
# and `private_data` built in an earlier cell.
plt.scatter(
    public_data['Rank'],
    public_data['Alumni Salary'],
    color='red',
)
plt.scatter(
    private_data['Rank'],
    private_data['Alumni Salary'],
    color='blue',
)
plt.xlabel('Rank')
plt.ylabel('Alumni Salary')
# Author's reading of the plot: better-ranked colleges have higher alumni
# salaries, and public and private colleges are similar at the same rank.

In [None]:
# 2x2 covariance matrix of Rank and Alumni Salary; the off-diagonal
# entries are the covariance between the two columns.
covariance = np.cov(dataset['Rank'], y=dataset['Alumni Salary'])
print(covariance)
# Author's observation: the covariance is negative.

# ANNUAL EXPECTED SALARY AND TOTAL FEES RATIO VERSUS RANK 

In [None]:
# Plot alumni salary per unit of total annual cost against rank.
plt.scatter(
    dataset['Rank'],
    dataset['Alumni Salary'].div(dataset['Total Annual Cost']),
    color='red',
)
plt.xlabel('Rank')
plt.ylabel('Income Investment ratio')
# Fixed axis limits (x: 0-700, y: 0-6) keep outliers out of view.
plt.axis((0, 700, 0, 6))