# **Data Preproceeing Tools**

Mount the Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Importing the **Libraries**

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive

Importing the **Datasets**

In [None]:
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/MachineLearning/Exp01/college_student_placement_dataset_with_missing.csv'
dataset = pd.read_csv(file_path)
dataset.head()
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values # Corrected to select only the 'Placement' column

In [5]:
print(dataset.isnull().mean()*100)

College_ID                  0.00
IQ                          0.04
Prev_Sem_Result             0.05
CGPA                        0.07
Academic_Performance        0.02
Internship_Experience       0.00
Extra_Curricular_Score      0.00
Communication_Skills        0.00
Projects_Completed          0.00
Placement                   0.00
ssc_p                     100.00
hsc_p                     100.00
degree_p                  100.00
dtype: float64


Taking care of **Missing** **Data**

In [18]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value=0)
imputer.fit(x[:,1:9])
x[:,1:9] = imputer.transform(x[:,1:9])

In [21]:
print(x[:,1:9])

[[107.0 6.61 6.28 ... 8 8 4]
 [97.0 5.52 5.37 ... 7 8 0]
 [109.0 5.36 5.83 ... 3 1 1]
 ...
 [89.0 6.08 6.25 ... 3 9 5]
 [107.0 8.77 8.92 ... 7 5 1]
 [109.0 9.41 9.77 ... 3 5 5]]


Encoding Categorial Data

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Corrected indices for one-hot encoding within x[:, 1:9]
# 'College_ID' (index 0 in x[:, 1:9])
# 'Internship_Experience' (index 4 in x[:, 1:9])
# 'Extra_Curricular_Score' (index 5 in x[:, 1:9])
# 'Communication_Skills' (index 6 in x[:, 1:9])
# 'Projects_Completed' (index 7 in x[:, 1:9])
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(sparse_output=False),[0, 4, 5, 6, 7])],remainder='passthrough')
z = np.array(ct.fit_transform(x[:,1:9]))
for row in z:
  print(row)

ValueError: all features must be in [0, 7] or [-8, 0]

Encoding Independent Variable

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(y_encoded.tolist())
print(y.tolist())


[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 

Spliting data into **Training Data And Testing Data**

In [10]:
from sklearn.model_selection import train_test_split
# Split the encoded data (z) into training and testing sets
x_train,x_test,y_train,y_test = train_test_split(z,y_encoded,test_size=0.2,random_state = 1)
print(x_train)

[[0.0 0.0 0.0 ... 4 2 'No']
 [0.0 0.0 0.0 ... 4 5 'No']
 [0.0 0.0 0.0 ... 10 2 'Yes']
 ...
 [0.0 0.0 0.0 ... 8 0 'No']
 [0.0 0.0 0.0 ... 1 5 'No']
 [0.0 0.0 0.0 ... 1 2 'No']]


In [11]:
print(x_train)

[[0.0 0.0 0.0 ... 4 2 'No']
 [0.0 0.0 0.0 ... 4 5 'No']
 [0.0 0.0 0.0 ... 10 2 'Yes']
 ...
 [0.0 0.0 0.0 ... 8 0 'No']
 [0.0 0.0 0.0 ... 1 5 'No']
 [0.0 0.0 0.0 ... 1 2 'No']]


In [12]:
print(y_train)

[0 0 1 ... 0 0 0]


Feature **Scaling**

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train[:, 99:] = sc.fit_transform(x_train[:, 99:])
x_test[:, 99:] = sc.transform(x_test[:, 99:])
print(x_train)
print(x_test)

[[-0.10786004 -0.09462807 -0.09986735 ... -1.26784561 -1.32271584
  -0.89311538]
 [-0.10786004 -0.09462807 -0.09986735 ... -0.76226919 -0.88694124
  -1.5908277 ]
 [-0.10786004 -0.09462807 -0.09986735 ... -0.44799195 -0.34389906
  -1.24197154]
 ...
 [-0.10786004 -0.09462807 -0.09986735 ...  0.06441659  0.03153751
  -0.89311538]
 [-0.10786004 -0.09462807 -0.09986735 ...  1.48549626  1.23829792
  -1.24197154]
 [-0.10786004 -0.09462807 -0.09986735 ...  1.16438691  1.45953733
   1.54887774]]
[[-0.10786004 -0.09462807 -0.09986735 ...  1.47866415  1.66736828
   1.20002158]
 [-0.10786004 -0.09462807 -0.09986735 ...  0.72713163  0.99024161
  -0.19540306]
 [-0.10786004 -0.09462807 -0.09986735 ...  0.81594911  0.56117124
   1.54887774]
 ...
 [-0.10786004 -0.09462807 -0.09986735 ... -1.17902813 -1.00761729
   0.85116542]
 [-0.10786004 -0.09462807 -0.09986735 ... -0.85108667 -0.75285675
  -1.24197154]
 [-0.10786004 -0.09462807 -0.09986735 ... -0.68028382 -0.73944831
   1.20002158]]


In [16]:
# Inspect the data types of columns in x_train
print(x_train.dtype)

# Print the first row of x_train to see the values
print(x_train[0, :])

object
[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 105.0 5.68 5.56 3.0 'No' 6 4 2 'No']
