In [1]:
import pandas as pd

In [2]:
# reading the csv file
df_exams=pd.read_csv('StudentsPerformance.csv')
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# 1. Selecting one column

## 1.1 Syntax 1

In [3]:
# select a column with [] (preferred way to select a column)
df_exams['gender']

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [4]:
# check out the data type of a column 
type(df_exams['gender'])

pandas.core.series.Series

In [6]:
# series: attributes and methods 
df_exams['gender'].index

RangeIndex(start=0, stop=1000, step=1)

In [7]:
# series: attributes and methods 
df_exams['gender'].head()

0    female
1    female
2    female
3      male
4      male
Name: gender, dtype: object

## 1.2 Syntax 2


In [8]:
# select a column with .
df_exams.gender

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [9]:
# select a column with . (pitfalls)
df_exams.math score

SyntaxError: invalid syntax (3061062658.py, line 2)

In [10]:
# select the same column using []
df_exams['math score']

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

# 2. Selecting two or more columns 

In [12]:
# select 2 column using [[]]
df_exams[['gender' , 'math score']]

Unnamed: 0,gender,math score
0,female,72
1,female,69
2,female,90
3,male,47
4,male,76
...,...,...
995,female,88
996,male,62
997,female,59
998,female,68


In [13]:
# check out the data type of the selection
type(df_exams[['gender' , 'math score']])

pandas.core.frame.DataFrame

In [14]:
# seelct 2 or more colums using [[]]
df_exams[['gender' , 'math score' , 'reading score' , 'writing score']]

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72,74
1,female,69,90,88
2,female,90,95,93
3,male,47,57,44
4,male,76,78,75
...,...,...,...,...
995,female,88,99,95
996,male,62,55,55
997,female,59,71,65
998,female,68,78,77


In [16]:
# we can't select 2 or more columns with the "."
df_exams.'gender' , 'math score'

SyntaxError: invalid syntax (4229815214.py, line 2)

# 3. Add new column to a dataframe

# 1. Adding a new column with scalar value 

### 3.1 Adding a new column with a scalar value

In [17]:
# adding new column to dataframe
df_exams['language score'] = 70

In [18]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,70
1,female,group C,some college,standard,completed,69,90,88,70
2,female,group B,master's degree,standard,none,90,95,93,70
3,male,group A,associate's degree,free/reduced,none,47,57,44,70
4,male,group C,some college,standard,none,76,78,75,70
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,70
996,male,group C,high school,free/reduced,none,62,55,55,70
997,female,group C,high school,free/reduced,completed,59,71,65,70
998,female,group D,some college,standard,completed,68,78,77,70


### 3.2 Adding a new column with an array

In [19]:
# the dataframe has 1000 rows , so we need to create an array with 1000 elements

In [21]:
# import numpy
import numpy as np

In [26]:
# create an array of 1000 elememts
language_score = np.arange(0, 1000)

In [27]:
len(language_score)

1000

In [28]:
# adding a new column to dataframe with an array
df_exams['language_score'] = language_score

In [30]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,language_score
0,female,group B,bachelor's degree,standard,none,72,72,74,70,0
1,female,group C,some college,standard,completed,69,90,88,70,1
2,female,group B,master's degree,standard,none,90,95,93,70,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,70,3
4,male,group C,some college,standard,none,76,78,75,70,4
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,70,995
996,male,group C,high school,free/reduced,none,62,55,55,70,996
997,female,group C,high school,free/reduced,completed,59,71,65,70,997
998,female,group D,some college,standard,completed,68,78,77,70,998


In [37]:
# create random integer numbers between 1 and 100
int_language_score = np.random.randint(1,100,size=1000)

In [38]:
# min value (it is inclusive)
min(int_language_score)

np.int32(1)

In [39]:
# max value (it is exclusive)
max(int_language_score)

np.int32(99)

In [42]:
# adding a new column to dataframe with an array
df_exams['language score'] = int_language_score
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,language_score
0,female,group B,bachelor's degree,standard,none,72,72,74,59,0
1,female,group C,some college,standard,completed,69,90,88,59,1
2,female,group B,master's degree,standard,none,90,95,93,42,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,90,3
4,male,group C,some college,standard,none,76,78,75,47,4
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,67,995
996,male,group C,high school,free/reduced,none,62,55,55,95,996
997,female,group C,high school,free/reduced,completed,59,71,65,8,997
998,female,group D,some college,standard,completed,68,78,77,20,998


In [44]:
# create random float numbers between 1 and 100
np.random.uniform(1,100,size=100)

array([19.90920433, 78.31106223, 89.08885495, 53.19319614, 71.54767275,
       40.4065396 , 22.59928877, 95.16018744, 20.92157217, 93.09498424,
        8.42579286, 73.36187948, 71.17952224, 80.62947479, 65.458954  ,
       87.06015679, 56.62956211, 71.18745481,  6.74970566, 41.48853758,
       61.26285853, 12.84565685, 22.42987049, 98.91341589, 81.38085138,
       57.97316199, 47.87576552,  6.26395404, 44.48597209, 15.82142914,
       75.23242239, 43.94580718, 21.28756921, 16.90966429, 92.34863715,
       75.27666453, 34.88012872, 13.12461673, 54.03226963, 10.03460343,
       63.37485707, 34.14956414, 11.20945195, 66.64510766, 65.14224973,
       84.91557964, 98.61258199, 88.55488153, 50.19792652, 57.74120945,
       46.10072054, 14.87392633, 92.87985629, 93.30675324, 33.72769518,
       11.66564569, 35.20258532, 55.69106708, 66.86688468, 99.05052019,
        1.75296748, 95.88854519, 82.95103306, 26.034324  , 51.8490822 ,
       14.53266735, 55.95158967, 87.54868918, 26.1651613 , 90.96

# 4. Add New Column with assign() and insert()

## 1. Adding a new column

### 4.1 assign()

In [46]:
# When to use assign()?
#-> Add multiple columns in a single line of code
#-> When you need to overwrite the values of an existing columns(best practice)

#It returns a new object(a copy) with all the original columns in addition to the new one

In [47]:
import numpy as np

In [48]:
# create ramdom numbers for our new score columns
score1=np.random.randint(1,100,size=1000)
score2=np.random.randint(1,100,size=1000)

In [52]:
# create a series using the random numbers
series1=pd.Series(score1,index=np.arange(0,1000))
series2=pd.Series(score2,index=np.arange(0,1000))

In [54]:
# using assign() to add multiple columns
df_exams= df_exams.assign(score1=series1 , score2=series2)

In [55]:
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,language_score,score1,score2
0,female,group B,bachelor's degree,standard,none,72,72,74,59,0,28,80
1,female,group C,some college,standard,completed,69,90,88,59,1,16,44
2,female,group B,master's degree,standard,none,90,95,93,42,2,28,89
3,male,group A,associate's degree,free/reduced,none,47,57,44,90,3,38,52
4,male,group C,some college,standard,none,76,78,75,47,4,24,34
...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,67,995,39,41
996,male,group C,high school,free/reduced,none,62,55,55,95,996,93,80
997,female,group C,high school,free/reduced,completed,59,71,65,8,997,80,90
998,female,group D,some college,standard,completed,68,78,77,20,998,51,85


### 4.2 insert()

In [56]:
# inserts a new column at a specific position or index
# does not create a copy but changes the actual 

In [57]:
# using index() to add a column at a specific position
df_exams.insert(1,'test',series1)

In [58]:
# show dataframe
df_exams

Unnamed: 0,gender,test,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,language_score,score1,score2
0,female,28,group B,bachelor's degree,standard,none,72,72,74,59,0,28,80
1,female,16,group C,some college,standard,completed,69,90,88,59,1,16,44
2,female,28,group B,master's degree,standard,none,90,95,93,42,2,28,89
3,male,38,group A,associate's degree,free/reduced,none,47,57,44,90,3,38,52
4,male,24,group C,some college,standard,none,76,78,75,47,4,24,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,39,group E,master's degree,standard,completed,88,99,95,67,995,39,41
996,male,93,group C,high school,free/reduced,none,62,55,55,95,996,93,80
997,female,80,group C,high school,free/reduced,completed,59,71,65,8,997,80,90
998,female,51,group D,some college,standard,completed,68,78,77,20,998,51,85
