# Machine learning steps:

### 1. Gather data
### 2. Preprocess data
### 3. Choose model
### 4. Train model
### 5. Test the model
### 6. Hyperparameter tuning
### 7. Test against real world data

# Pandas

In [8]:
import pandas as pd
import numpy as np

In [None]:
# numpy 
"""
2 - scalar
[1, 2] - vector
[[1, 2], [3, 4]] - matrix
"""
# pandas
"""
1d - series - homogenous array, size-immutable
2d - Dataframe - heterogenously typed columns, size-mutable
"""

#### Series - array like
#### Dataframe - tabular

## Series

In [11]:
# pandas.Series(data, index, dtype)

In [17]:
series = pd.Series([1, 2, 3, 4, 5])
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [19]:
series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(series)

a    1
b    2
c    3
d    4
e    5
dtype: int64


## Creating series using numpy

In [26]:
# randn draws samples from the standard normal distribution (mean 0, std 1),
# so values can be negative and are not limited to the range -1 to 1.
series1 = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(series1)

a   -1.043814
b    0.937861
c   -1.119614
d   -0.398839
e    0.092144
dtype: float64


## Creating series using dictionary 

In [29]:
dictionary = {'a':1, 'b':2, 'c':3}
d1 = pd.Series(dictionary)
d1

a    1
b    2
c    3
dtype: int64

#### [] - series
#### [[]] - dataframe

In [34]:
# d1.iloc[[i, j]] - position-based selection (integer positions, starting at 0)
# d1.loc[['a', 'b']] - label-based selection (index labels)

In [36]:
# d1 is indexed by the string labels 'a', 'b', 'c', so plain [] is given labels.
# Integer keys here would be treated as positions, a fallback pandas has
# deprecated; use .iloc when positional access is wanted.
d1[['b', 'c']]

  d1[[1, 2]]


b    2
c    3
dtype: int64

In [38]:
d1.iloc[[1, 2]]

b    2
c    3
dtype: int64

In [42]:
# Arithmetic aligns the two Series on their index labels; both use 'a'..'e' here,
# so every label matches. Labels present in only one operand produce NaN.
series + series1

a   -0.043814
b    2.937861
c    1.880386
d    3.601161
e    5.092144
dtype: float64

In [44]:
series2 = pd.Series(np.random.randn(5))
series2

0    0.111623
1    1.482713
2   -1.337679
3    0.005507
4   -0.170813
dtype: float64

In [46]:
series + series2

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

# Dataframe

In [49]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [51]:
df = pd.DataFrame(columns=['c1', 'c2', 'c3'])
print(df)

Empty DataFrame
Columns: [c1, c2, c3]
Index: []


In [69]:
# index supplies the row labels (1 through 5 here); no data is given, so every cell is empty
df = pd.DataFrame(columns=['c1', 'c2', 'c3', 'c4'], index=range(1, 6))
df

Unnamed: 0,c1,c2,c3,c4
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,


In [71]:
dictionary = {'a':1, 'b':2, 'c':3}
# A list holding one dict yields one row: the keys become the column names and
# the values fill that row, which gets the default label 0.
df1 = pd.DataFrame([dictionary])
df1

Unnamed: 0,a,b,c
0,1,2,3


In [73]:
print(df1.index)
print(df.index)

RangeIndex(start=0, stop=1, step=1)
RangeIndex(start=1, stop=6, step=1)


In [75]:
print(df1.columns)
print(df.columns)

Index(['a', 'b', 'c'], dtype='object')
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')


In [77]:
# Build a DataFrame from a list of rows; since no column names are supplied,
# pandas labels the columns 0 and 1.
mylist = [['Apple', 'Red'], ['Banana', 'Yellow'], ['Orange', 'orange']]
mydata = pd.DataFrame(data=mylist)
mydata

Unnamed: 0,0,1
0,Apple,Red
1,Banana,Yellow
2,Orange,orange


In [79]:
mydata = pd.DataFrame(mylist, columns=['Fruit name', 'color'])
mydata

Unnamed: 0,Fruit name,color
0,Apple,Red
1,Banana,Yellow
2,Orange,orange


### Dataframe using numpy array

In [82]:
# Each inner list of the 2-D array becomes one row of the DataFrame.
# NOTE(review): the last row [3, 4] does not fit the 'Even'/'Odd' column names
# (3 is odd, 4 is even) — confirm whether [4, 5] was intended.
mylist1 = np.array([
    [0, 1],
    [2, 3],
    [3, 4]
])
mydf = pd.DataFrame(mylist1, columns=['Even', 'Odd'])
mydf

Unnamed: 0,Even,Odd
0,0,1
1,2,3
2,3,4


### Load a csv data using pandas

In [112]:
df = pd.read_csv('./csv files/cereals.csv')
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [114]:
# Use one of the columns as the row index.
df.set_index('name')  # returns a new DataFrame; df itself is unchanged

# df.set_index('name', inplace=True)  # would modify df in place instead

Unnamed: 0_level_0,calories,protein,vitamins,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100% Bran,70,4,25,68.402973
100% Natural Bran,120,3,0,33.983679
All-Bran,70,4,25,59.425505
All-Bran with Extra Fiber,50,4,25,93.704912
Almond Delight,110,2,25,34.384843
Apple Cinnamon Cheerios,110,2,25,29.509541
Apple Jacks,110,2,25,33.174094
Basic 4,130,3,25,37.038562
Bran Chex,90,2,25,49.120253
Bran Flakes,90,3,25,53.313813


### Examining the dataframe

In [116]:
df.head() #returns first 5 rows

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843


In [118]:
df.tail() #return last 5 rows

Unnamed: 0,name,calories,protein,vitamins,rating
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [120]:
df.head(10)

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [122]:
df.describe()

Unnamed: 0,calories,protein,vitamins,rating
count,10.0,10.0,10.0,10.0
mean,95.0,2.9,22.5,49.205817
std,25.495098,0.875595,7.905694,20.315297
min,50.0,2.0,0.0,29.509541
25%,75.0,2.0,25.0,34.08397
50%,100.0,3.0,25.0,43.079408
75%,110.0,3.75,25.0,57.897582
max,130.0,4.0,25.0,93.704912


### Slicing rows using bracket operators

In [125]:
df[1:4]

Unnamed: 0,name,calories,protein,vitamins,rating
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912


In [133]:
# Plain [] slicing selects rows, so string column labels are rejected on this
# integer row index (TypeError below). For a range of columns use
# df.loc[:, 'calories':'rating'] instead.
df['calories':'rating']

TypeError: cannot do slice indexing on Index with these indexers [calories] of type str

In [139]:
# 'calories','rating' inside single brackets is one tuple key, looked up as a
# single column label (KeyError below); pass a list to select several columns.
df['calories','rating']

KeyError: ('calories', 'rating')

In [137]:
df[['calories', 'rating']]

Unnamed: 0,calories,rating
0,70,68.402973
1,120,33.983679
2,70,59.425505
3,50,93.704912
4,110,34.384843
5,110,29.509541
6,110,33.174094
7,130,37.038562
8,90,49.120253
9,90,53.313813


### Boolean filter

In [144]:
df1 = df[0:5]
bool_list = [True, False, True, True, False] 
df1[bool_list] #displays the rows where there is a True

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912


In [146]:
df['calories'] > 70 #returns boolean output

0    False
1     True
2    False
3    False
4     True
5     True
6     True
7     True
8     True
9     True
Name: calories, dtype: bool

In [148]:
condn = df['calories'] > 70
df[condn]

Unnamed: 0,name,calories,protein,vitamins,rating
1,100% Natural Bran,120,3,0,33.983679
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


### filtering rows using &(and) and |(or)

In [155]:
df[ (df['calories']>70) & (df['protein']<3) ]

Unnamed: 0,name,calories,protein,vitamins,rating
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
8,Bran Chex,90,2,25,49.120253


In [159]:
df[ (df['calories']>70) | (df['protein']<3) ]

Unnamed: 0,name,calories,protein,vitamins,rating
1,100% Natural Bran,120,3,0,33.983679
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


### Filtering data using loc()

In [164]:
# a single row label plus a single column label returns one scalar value, not a Series
df.loc[2, 'protein']

4

In [168]:
df.loc[[2], ['protein']]

Unnamed: 0,protein
2,4


In [172]:
df.loc[0:4, ['protein', 'name']]

Unnamed: 0,protein,name
0,4,100% Bran
1,3,100% Natural Bran
2,4,All-Bran
3,4,All-Bran with Extra Fiber
4,2,Almond Delight


In [176]:
# more than one column will automatically be considered as dataframe, so no need to specify in []
df.loc[0:4, 'name':'protein']

Unnamed: 0,name,calories,protein
0,100% Bran,70,4
1,100% Natural Bran,120,3
2,All-Bran,70,4
3,All-Bran with Extra Fiber,50,4
4,Almond Delight,110,2


### Filtering data using iloc()
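iloc selects rows and columns by integer position only, so we need not remember the column names. Unlike loc, whose label slices include the end label, iloc slices exclude the end position (loc[0:4] and iloc[0:5] both return rows 0 to 4).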

In [180]:
df.iloc[0:5, 0:3] #slicing using iloc

Unnamed: 0,name,calories,protein
0,100% Bran,70,4
1,100% Natural Bran,120,3
2,All-Bran,70,4
3,All-Bran with Extra Fiber,50,4
4,Almond Delight,110,2


In [182]:
df.iloc[[0, 2, 5], 0:3] #indexing and slicing using iloc

Unnamed: 0,name,calories,protein
0,100% Bran,70,4
2,All-Bran,70,4
5,Apple Cinnamon Cheerios,110,2


### Adding and deleting rows and columns

In [186]:
# df1 was taken as the slice df[0:5], so in this run the assignment also showed
# up in df (see the next cell).
# NOTE(review): this depends on the slice sharing data with df; pandas may warn
# about it, and with Copy-on-Write enabled df would stay unchanged — confirm
# against the pandas version in use.
df1.loc[4] = ['Priyanka', 110, 1, 25, 27.75]
df1

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75


In [190]:
df #df will also get updated

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


#### Changes made in df1 also affect df because df1 was created as a slice of df (df[0:5]) and is not an independent copy
##### To avoid this, use the .copy() method, which makes a deep copy by default

### Deep copy of Dataframe

In [236]:
df1 = df.iloc[0:5, ].copy()

In [238]:
df1.drop(3) #drop- to delete a particular row or column but won't modify df1

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
4,Priyanka,110,1,25,27.75


In [240]:
df1.drop(3, axis=0) #axis=0 is row and axis=1 is column

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
4,Priyanka,110,1,25,27.75


In [244]:
df1.drop('rating', axis=1) #if having coln names then specify colmn name instead of index

Unnamed: 0,name,calories,protein,vitamins
0,100% Bran,70,4,25
1,100% Natural Bran,120,3,0
2,All-Bran,70,4,25
3,All-Bran with Extra Fiber,50,4,25
4,Priyanka,110,1,25


In [246]:
df1.drop(3, axis=0, inplace=True)  # inplace=True removes row 3 from df1 itself instead of returning a new DataFrame

In [248]:
df1

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
4,Priyanka,110,1,25,27.75


In [232]:
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


### Adding new column

In [311]:
df1['My new column'] = [1, 2, 3, 4] #4 values for 4 rows
df1

Unnamed: 0,name,calories,protein,vitamins,rating,My new column
0,100% Bran,70,4,25,68.402973,1
1,100% Natural Bran,120,3,0,33.983679,2
2,All-Bran,70,4,25,59.425505,3
3,Priyanka,110,1,25,27.75,4


### Sorting values

In [261]:
# returns a modified one
asc_df = df.sort_values(by='calories') #sort by specific column in ascending order
asc_df

Unnamed: 0,name,calories,protein,vitamins,rating
3,All-Bran with Extra Fiber,50,4,25,93.704912
0,100% Bran,70,4,25,68.402973
2,All-Bran,70,4,25,59.425505
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
1,100% Natural Bran,120,3,0,33.983679
7,Basic 4,130,3,25,37.038562


In [265]:
desc_df = df.sort_values(by='calories', ascending=False)
desc_df

Unnamed: 0,name,calories,protein,vitamins,rating
7,Basic 4,130,3,25,37.038562
1,100% Natural Bran,120,3,0,33.983679
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813
0,100% Bran,70,4,25,68.402973
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912


### Changing the column index

In [268]:
df.columns

Index(['name', 'calories', 'protein', 'vitamins', 'rating'], dtype='object')

In [270]:
df.columns = ['col1', 'col2', 'col3', 'col4', 'col5']

In [272]:
df

Unnamed: 0,col1,col2,col3,col4,col5
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [274]:
df.columns = ['name', 'calories', 'protein', 'vitamins', 'rating']

In [276]:
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [290]:
df.to_csv('./csv files/myfile.csv') #it will create the file automatically

In [292]:
newdf = pd.read_csv('./csv files/myfile.csv')
newdf

Unnamed: 0.1,Unnamed: 0,name,calories,protein,vitamins,rating
0,0,100% Bran,70,4,25,68.402973
1,1,100% Natural Bran,120,3,0,33.983679
2,2,All-Bran,70,4,25,59.425505
3,3,All-Bran with Extra Fiber,50,4,25,93.704912
4,4,Priyanka,110,1,25,27.75
5,5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,6,Apple Jacks,110,2,25,33.174094
7,7,Basic 4,130,3,25,37.038562
8,8,Bran Chex,90,2,25,49.120253
9,9,Bran Flakes,90,3,25,53.313813


In [294]:
# index=False leaves the row index out of the CSV, so reading the file back
# does not produce an extra "Unnamed: 0" column.
# (index_label=False only omits the header label of the index column; the
# index values themselves would still be written to the file.)
df.to_csv('./csv files/myfile.csv', index=False)  # creates the file if it does not exist

In [296]:
newdf = pd.read_csv('./csv files/myfile.csv')
newdf

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


### Concatenating the DataFrames

In [299]:
df2 = df[5:8].copy()
df2

Unnamed: 0,name,calories,protein,vitamins,rating
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562


In [303]:
df1 = df1.reset_index(drop=True) #it will reset the index values of rows from 0 to n-1
df1

Unnamed: 0,name,calories,protein,vitamins,rating,My new column
0,100% Bran,70,4,25,68.402973,1
1,100% Natural Bran,120,3,0,33.983679,2
2,All-Bran,70,4,25,59.425505,3
3,Priyanka,110,1,25,27.75,4


In [305]:
df2 = df2.reset_index(drop=True) #it will reset the index values of rows from 0 to n-1
df2

Unnamed: 0,name,calories,protein,vitamins,rating
0,Apple Cinnamon Cheerios,110,2,25,29.509541
1,Apple Jacks,110,2,25,33.174094
2,Basic 4,130,3,25,37.038562


In [313]:
# Drop the extra column so df1 has the same columns as the frames it is
# concatenated with; otherwise concat would fill 'My new column' with NaN
# for rows coming from a frame that lacks it.
df1.drop('My new column', axis=1, inplace=True)
df1

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,Priyanka,110,1,25,27.75


In [315]:
pd.concat([df, df1], ignore_index=True) #ignore index will reset the index number

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [319]:
pd.concat([df, df1])  # without ignore_index, each frame keeps its own index labels after concatenating

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Priyanka,110,1,25,27.75
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [321]:
df = pd.read_csv('./csv files/cereals.csv')
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


In [325]:
# df was re-read from cereals.csv in the previous cell, so it holds the original
# data again: row 4 is 'Almond Delight' rather than the edited 'Priyanka' row.
pd.concat([df, df1])

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813


### Converting dictionary to dataframe and using Group By

In [332]:
d = {
    'Gender': ['female', 'male', 'female', 'male'],
    'Score': [86, 89, 67, 90]
}

new_df = pd.DataFrame.from_dict(d)
new_df

Unnamed: 0,Gender,Score
0,female,86
1,male,89
2,female,67
3,male,90


In [334]:
new_df = pd.DataFrame(d)
new_df

Unnamed: 0,Gender,Score
0,female,86
1,male,89
2,female,67
3,male,90


In [341]:
# Group the rows by the 'Gender' column and average the remaining numeric
# column; the distinct Gender values become the index of the result.
group = new_df.groupby('Gender').mean()

group

Unnamed: 0_level_0,Score
Gender,Unnamed: 1_level_1
female,76.5
male,89.5
