# Data Transformations in Pandas
#### by Sukritha Joshi

In [1]:
# importing required libraries
import numpy as np
import pandas as pd

In [2]:
# creating a students dataframe with a single column of names
df_students = pd.DataFrame(
    {'student names': ['Sarah', 'Daniel', 'Smitha', 'Kiran', 'Hailey']}
)
df_students

Unnamed: 0,student names
0,Sarah
1,Daniel
2,Smitha
3,Kiran
4,Hailey


In [3]:
# creating a table with random values for scores of students
# a seeded generator makes the scores identical on every Restart & Run All
# NOTE(review): high=10.1 allows scores slightly above 10 -- confirm whether that is intended
rng = np.random.default_rng(42)
df_scores = pd.DataFrame(
    rng.uniform(low=0, high=10.1, size=(5, 3)),
    columns=['Test1', 'Test2', 'Test3'],
).round(2)
df_scores

Unnamed: 0,Test1,Test2,Test3
0,5.52,9.39,3.1
1,7.43,7.97,3.32
2,4.14,5.91,3.84
3,2.9,7.0,1.84
4,2.61,0.27,9.43


In [4]:
# placing the names and the scores side by side (aligned on the index) to form the final dataset
df = pd.concat((df_students, df_scores), axis='columns')
df

Unnamed: 0,student names,Test1,Test2,Test3
0,Sarah,5.52,9.39,3.1
1,Daniel,7.43,7.97,3.32
2,Smitha,4.14,5.91,3.84
3,Kiran,2.9,7.0,1.84
4,Hailey,2.61,0.27,9.43


In [5]:
# re-numbering the rows so the index runs from 1 to len(df) instead of starting at 0
df.index = np.arange(len(df)) + 1

In [6]:
# viewing the dataframe after the index was re-assigned
df

Unnamed: 0,student names,Test1,Test2,Test3
1,Sarah,5.52,9.39,3.1
2,Daniel,7.43,7.97,3.32
3,Smitha,4.14,5.91,3.84
4,Kiran,2.9,7.0,1.84
5,Hailey,2.61,0.27,9.43


In [7]:
# giving the names column an identifier-style label
df = df.rename(columns={'student names': 'StudentName'})
df

Unnamed: 0,StudentName,Test1,Test2,Test3
1,Sarah,5.52,9.39,3.1
2,Daniel,7.43,7.97,3.32
3,Smitha,4.14,5.91,3.84
4,Kiran,2.9,7.0,1.84
5,Hailey,2.61,0.27,9.43


In [8]:
# the school has 3 mid terms, the average of the highest 2 exams is considered as the final grade
# this is a function to calculate the final score of each student

def best_of_3(a, b, c):
    """Return the largest of the three pairwise averages of a, b and c.

    The largest pairwise average is the average of the two highest values,
    i.e. the "best two out of three" score.
    """
    pair_averages = [(x + y) / 2 for x, y in ((a, b), (b, c), (c, a))]
    return max(pair_averages)

# score every row from its three test columns, then keep two decimals
test_columns = ['Test1', 'Test2', 'Test3']
df['FinalScore'] = (
    df[test_columns]
    .apply(lambda scores: best_of_3(scores['Test1'], scores['Test2'], scores['Test3']), axis=1)
    .round(2)
)

In [13]:
# viewing the dataframe after adding the new FinalScore column
df

Unnamed: 0,StudentName,Test1,Test2,Test3,FinalScore
1,Sarah,5.52,9.39,3.1,7.46
2,Daniel,7.43,7.97,3.32,7.7
3,Smitha,4.14,5.91,3.84,5.03
4,Kiran,2.9,7.0,1.84,4.95
5,Hailey,2.61,0.27,9.43,6.02


In [14]:
# a student who has scored below 4 has failed and will have to re-attempt the class.
# the styled dataframe below shades every score under 4 in red

def _shade_below_4(score):
    """Return the CSS for one cell: a red background when score < 4, else an empty background-color."""
    return "background-color: red" if score < 4 else "background-color: "

df.style.applymap(_shade_below_4, subset=df.columns[1:5])

Unnamed: 0,StudentName,Test1,Test2,Test3,FinalScore
1,Sarah,5.52,9.39,3.1,7.46
2,Daniel,7.43,7.97,3.32,7.7
3,Smitha,4.14,5.91,3.84,5.03
4,Kiran,2.9,7.0,1.84,4.95
5,Hailey,2.61,0.27,9.43,6.02


In [None]:
# finally let's add a column which explicitly states whether or not a student has passed

def result(FinalScore):
    """Return 'fail' when FinalScore is below 4, otherwise 'pass'."""
    return 'fail' if FinalScore < 4 else 'pass'

# label each final score directly; only one column is needed, so Series.map
# replaces the row-wise DataFrame.apply over the whole frame
df['Result'] = df['FinalScore'].map(result)

In [None]:
# viewing the dataframe after adding the Result column
df

In [None]:
# colour the Result column: red for 'fail', lime for any other value
df.style.applymap(
    lambda outcome: "background-color: lime" if outcome != 'fail' else "background-color: red",
    subset=['Result'],
)

In [None]:
# write the colour-coded table (red for 'fail', lime otherwise) to an Excel workbook
styled_results = df.style.applymap(
    lambda outcome: "background-color: lime" if outcome != 'fail' else "background-color: red",
    subset=['Result'],
)
styled_results.to_excel('Result.xlsx')

### data set 2

In [19]:
# loading the second dataset; the file is semicolon-delimited
df2 = pd.read_csv('student-mat.csv', delimiter=';')

In [20]:
# previewing the first rows of the second dataset
df2.head()

Unnamed: 0,school,sex,age,address,health,absences,G1,G2,G3
0,GP,F,18,U,3.0,6.0,5,6,6
1,GP,F,17,U,3.0,4.0,5,5,6
2,GP,F,15,U,3.0,10.0,7,8,10
3,GP,F,15,U,5.0,2.0,15,14,15
4,GP,F,16,U,5.0,4.0,6,10,10


In [21]:
# summary of the columns: dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   school    395 non-null    object 
 1   sex       395 non-null    object 
 2   age       395 non-null    int64  
 3   address   378 non-null    object 
 4   health    384 non-null    float64
 5   absences  380 non-null    float64
 6   G1        395 non-null    int64  
 7   G2        395 non-null    int64  
 8   G3        395 non-null    int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 27.9+ KB


In [22]:
# counting the missing values in each column
df2.isna().sum()

school       0
sex          0
age          0
address     17
health      11
absences    15
G1           0
G2           0
G3           0
dtype: int64

In [26]:
# tallying the missing records per column, as a count and as a percentage of all rows

null_counts = df2.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df2) * 100).round(1)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', '%'])
missing_data

Unnamed: 0,Total,%
address,17,4.3
absences,15,3.8
health,11,2.8
G3,0,0.0
G2,0,0.0
G1,0,0.0
age,0,0.0
sex,0,0.0
school,0,0.0


In [27]:
# rows missing health or absences make up a small percentage of the data, so they are removed
required_columns = ['health', 'absences']
df2.dropna(subset=required_columns, inplace=True)

In [28]:
# re-counting the missing values after the rows were dropped
df2.isna().sum()

school       0
sex          0
age          0
address     15
health       0
absences     0
G1           0
G2           0
G3           0
dtype: int64

In [29]:
# frequency of each address category
df2['address'].value_counts()

U    276
R     79
Name: address, dtype: int64

In [30]:
# to handle missing data in the address column, we can replace the NaN values with the most frequently occurring category in that column
# this method has the disadvantage that it may imbalance the dataset

# a {column: value} mapping restricts the fill to 'address'; a bare scalar would
# overwrite NaN in every column of df2 with the address category
most_common_address = df2['address'].value_counts().idxmax()
df2.fillna(value={'address': most_common_address}, inplace=True)


In [31]:
# re-counting the missing values after the fill
df2.isna().sum()

school      0
sex         0
age         0
address     0
health      0
absences    0
G1          0
G2          0
G3          0
dtype: int64

                                                    ---x----  end  ----x---