In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('shanghaiData.csv')
data = data.rename(columns={'world_rank':'ranking dunia', 'university_name':'universitas', 'national_rank':'ranking nasional', 'total_score':'total skor', 'award':'penghargaan', 'year':'tahun'})
data.head()

Unnamed: 0,ranking dunia,universitas,ranking nasional,total skor,alumni,penghargaan,hici,ns,pub,pcp,tahun
0,1,Harvard University,1,100.0,100.0,100.0,100.0,100.0,100.0,72.4,2005
1,2,University of Cambridge,1,73.6,99.8,93.4,53.3,56.6,70.9,66.9,2005
2,3,Stanford University,2,73.4,41.1,72.2,88.5,70.9,72.3,65.0,2005
3,4,"University of California, Berkeley",3,72.8,71.8,76.0,69.4,73.9,72.2,52.7,2005
4,5,Massachusetts Institute of Technology (MIT),4,70.1,74.0,80.6,66.7,65.8,64.3,53.0,2005


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ranking dunia     4897 non-null   object 
 1   universitas       4896 non-null   object 
 2   ranking nasional  4896 non-null   object 
 3   total skor        1101 non-null   float64
 4   alumni            4896 non-null   float64
 5   penghargaan       4895 non-null   float64
 6   hici              4895 non-null   float64
 7   ns                4875 non-null   float64
 8   pub               4895 non-null   float64
 9   pcp               4895 non-null   float64
 10  tahun             4897 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 421.0+ KB


#### Membagi Dataset Menjadi Training Test & Testing Test

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
x = data[['alumni']]
y = data[['penghargaan']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

In [6]:
print("Dimensi x_train : ", x_train.shape)
print("Dimensi x_train : ", x_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test  : ", y_test.shape)

Dimensi x_train :  (3427, 1)
Dimensi x_train :  (1470, 1)
Dimensi y_train :  (3427, 1)
Dimensi y_test  :  (1470, 1)


#### Normalisasi Data

In [7]:
data_normalisasi = data.copy()
data_normalisasi['hici'] = (data_normalisasi['hici'] - data_normalisasi['hici'].min()) / (data_normalisasi['hici'].max() - data_normalisasi['hici'].min())
data_normalisasi['ns']   = (data_normalisasi['ns'] - data_normalisasi['ns'].min()) / (data_normalisasi['ns'].max() - data_normalisasi['ns'].min())
print(data_normalisasi['hici'])
print(data_normalisasi['ns'])

0       1.000
1       0.533
2       0.885
3       0.694
4       0.667
        ...  
4892    0.050
4893    0.076
4894    0.036
4895    0.000
4896    0.149
Name: hici, Length: 4897, dtype: float64
0       1.000
1       0.566
2       0.709
3       0.739
4       0.658
        ...  
4892    0.109
4893    0.051
4894    0.108
4895    0.122
4896    0.075
Name: ns, Length: 4897, dtype: float64


#### Standarisasi Data

In [8]:
data_standarisasi = data.copy()
data_standarisasi.drop('ranking dunia', axis = 1, inplace= True)
data_standarisasi.drop('universitas', axis = 1, inplace= True)
data_standarisasi.drop('ranking nasional', axis = 1, inplace= True)

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler()
scale_data = scaler.fit_transform(data_standarisasi)
print("Nilai data sebelum standarisasi : ")
print(data[0:4])
print("Nilai standar deviasi : ", np.std(data))
print("Nilai data setelah standarisasi : ")
print(scale_data)
print("Nilai standar deviasi : ", np.std(scale_data))

Nilai data sebelum standarisasi : 
  ranking dunia                         universitas ranking nasional  \
0             1                  Harvard University                1   
1             2             University of Cambridge                1   
2             3                 Stanford University                2   
3             4  University of California, Berkeley                3   

   total skor  alumni  penghargaan   hici     ns    pub   pcp  tahun  
0       100.0   100.0        100.0  100.0  100.0  100.0  72.4   2005  
1        73.6    99.8         93.4   53.3   56.6   70.9  66.9   2005  
2        73.4    41.1         72.2   88.5   70.9   72.3  65.0   2005  
3        72.8    71.8         76.0   69.4   73.9   72.2  52.7   2005  
Nilai standar deviasi :  total skor     13.551028
alumni         14.139192
penghargaan    15.492527
hici           14.381240
ns             12.510246
pub            13.049475
pcp             9.253406
tahun           3.197250
dtype: float64
Nilai dat

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


#### Data Cleaning Nilai NULL

In [11]:
data.isna().sum()

ranking dunia          0
universitas            1
ranking nasional       1
total skor          3796
alumni                 1
penghargaan            2
hici                   2
ns                    22
pub                    2
pcp                    2
tahun                  0
dtype: int64

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
#Mengganti Nilai NULL menggunakan nilai MEAN
imputer1 = SimpleImputer(strategy="mean")
data['total skor'] = imputer1.fit_transform(data[['total skor']])
data['alumni'] = imputer1.fit_transform(data[['alumni']])
data['penghargaan'] = imputer1.fit_transform(data[['alumni']])
data['hici'] = imputer1.fit_transform(data[['hici']])
data['ns'] = imputer1.fit_transform(data[['ns']])
data['pub'] = imputer1.fit_transform(data[['pub']])
data['pcp'] = imputer1.fit_transform(data[['pcp']])

In [14]:
data.isna().sum()

ranking dunia       0
universitas         1
ranking nasional    1
total skor          0
alumni              0
penghargaan         0
hici                0
ns                  0
pub                 0
pcp                 0
tahun               0
dtype: int64

#### Data Cleaning Nilai Duplikat

In [15]:
data.duplicated().sum()

0

In [16]:
df = pd.read_csv('Copy.csv')
df = df.rename(columns={'world_rank':'ranking dunia', 'university_name':'universitas', 'national_rank':'ranking nasional', 'total_score':'total skor', 'award':'penghargaan', 'year':'tahun'})
df.head()

Unnamed: 0,ranking dunia,universitas,ranking nasional,total skor,alumni,penghargaan,hici,ns,pub,pcp,tahun
0,1,Harvard University,1.0,100.0,100.0,100.0,100.0,100.0,100.0,72.4,2005.0
1,2,University of Cambridge,1.0,73.6,99.8,93.4,53.3,56.6,70.9,66.9,2005.0
2,3,Stanford University,2.0,73.4,41.1,72.2,88.5,70.9,72.3,65.0,2005.0
3,"4,""University of California, Berkeley"",3,72.8,...",,,,,,,,,,
4,5,Massachusetts Institute of Technology (MIT),4.0,70.1,74.0,80.6,66.7,65.8,64.3,53.0,2005.0


In [17]:
df[df.duplicated()]

Unnamed: 0,ranking dunia,universitas,ranking nasional,total skor,alumni,penghargaan,hici,ns,pub,pcp,tahun
4897,401-500,University of Zaragoza,9-13,,0.0,0.0,7.6,5.1,33.3,13.1,2015.0
4898,401-500,Utah State University,126-146,,13.6,0.0,3.6,10.8,25.1,15.5,2015.0
4899,401-500,Vienna University of Technology,4-6,,0.0,0.0,0.0,12.2,28.8,22.9,2015.0
4900,401-500,Wake Forest University,126-146,,0.0,0.0,14.9,7.5,25.0,11.9,2015.0


In [18]:
df.drop_duplicates(inplace=True)

In [19]:
df.duplicated().sum()

0

#### Mengganti Tipe Data

In [20]:
data.isna().sum()

ranking dunia       0
universitas         1
ranking nasional    1
total skor          0
alumni              0
penghargaan         0
hici                0
ns                  0
pub                 0
pcp                 0
tahun               0
dtype: int64

In [22]:
data['total skor'] = data['total skor'].astype('int')

#### One Hot Encoding

In [38]:
from sklearn.preprocessing import OneHotEncoder

y = OneHotEncoder(sparse=False)
y = y.fit_transform(data[['pub']])
data_y = pd.DataFrame(y)

In [39]:
y

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])