# Data Manipulation

## Creating the Sample Dataset:

In [1]:
import pandas as pd

# Creating a sample dataset for data manipulation
data = {
    'Name': ['Arav', 'Meera', 'Rahul', 'Mohit', 'Sana', 'Priya'],
    'Age': [16, 15, 16, 17, 15, 16],
    'Subject': ['Math', 'English', 'Science', 'Math', 'Science', 'English'],
    'Grade': ['A', 'B', 'A', 'C', 'B', 'A'],
    'Score': [85, 78, 92, 65, 80, 88]
}

# Converting the data into a Pandas DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

    Name  Age  Subject Grade  Score
0   Arav   16     Math     A     85
1  Meera   15  English     B     78
2  Rahul   16  Science     A     92
3  Mohit   17     Math     C     65
4   Sana   15  Science     B     80
5  Priya   16  English     A     88


## 1. Loading and Viewing Data

In [2]:
# Viewing the first few rows
print(df.head())

    Name  Age  Subject Grade  Score
0   Arav   16     Math     A     85
1  Meera   15  English     B     78
2  Rahul   16  Science     A     92
3  Mohit   17     Math     C     65
4   Sana   15  Science     B     80


In [3]:
# Viewing the last few rows
print(df.tail())

    Name  Age  Subject Grade  Score
1  Meera   15  English     B     78
2  Rahul   16  Science     A     92
3  Mohit   17     Math     C     65
4   Sana   15  Science     B     80
5  Priya   16  English     A     88


In [4]:
# Displaying information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     6 non-null      object
 1   Age      6 non-null      int64 
 2   Subject  6 non-null      object
 3   Grade    6 non-null      object
 4   Score    6 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 368.0+ bytes
None


## 2. Data Cleaning (Handling Duplicates, Renaming, etc.)

In [5]:
# Removing duplicates (if any)
df_cleaned = df.drop_duplicates()

In [6]:
df_cleaned

Unnamed: 0,Name,Age,Subject,Grade,Score
0,Arav,16,Math,A,85
1,Meera,15,English,B,78
2,Rahul,16,Science,A,92
3,Mohit,17,Math,C,65
4,Sana,15,Science,B,80
5,Priya,16,English,A,88


In [7]:
# Renaming columns
df_cleaned = df_cleaned.rename(columns={'Score': 'Exam_Score'})

In [8]:
df_cleaned

Unnamed: 0,Name,Age,Subject,Grade,Exam_Score
0,Arav,16,Math,A,85
1,Meera,15,English,B,78
2,Rahul,16,Science,A,92
3,Mohit,17,Math,C,65
4,Sana,15,Science,B,80
5,Priya,16,English,A,88


In [9]:
# Converting 'Age' to float
df_cleaned['Age'] = df_cleaned['Age'].astype('float')

print(df_cleaned)

    Name   Age  Subject Grade  Exam_Score
0   Arav  16.0     Math     A          85
1  Meera  15.0  English     B          78
2  Rahul  16.0  Science     A          92
3  Mohit  17.0     Math     C          65
4   Sana  15.0  Science     B          80
5  Priya  16.0  English     A          88


## 3. Filtering and Selecting Data

In [10]:
# Selecting specific columns
names_and_scores = df[['Name', 'Score']]

In [11]:
names_and_scores

Unnamed: 0,Name,Score
0,Arav,85
1,Meera,78
2,Rahul,92
3,Mohit,65
4,Sana,80
5,Priya,88


In [12]:
# Filtering students with scores greater than 80
high_scores = df[df['Score'] > 80]

In [13]:
high_scores

Unnamed: 0,Name,Age,Subject,Grade,Score
0,Arav,16,Math,A,85
2,Rahul,16,Science,A,92
5,Priya,16,English,A,88


In [14]:
# Using .loc to select rows and columns
subset = df.loc[0:3, ['Name', 'Subject', 'Score']]

In [15]:
subset

Unnamed: 0,Name,Subject,Score
0,Arav,Math,85
1,Meera,English,78
2,Rahul,Science,92
3,Mohit,Math,65


In [16]:
print(names_and_scores)
print(high_scores)
print(subset)

    Name  Score
0   Arav     85
1  Meera     78
2  Rahul     92
3  Mohit     65
4   Sana     80
5  Priya     88
    Name  Age  Subject Grade  Score
0   Arav   16     Math     A     85
2  Rahul   16  Science     A     92
5  Priya   16  English     A     88
    Name  Subject  Score
0   Arav     Math     85
1  Meera  English     78
2  Rahul  Science     92
3  Mohit     Math     65


## 4. Grouping and Aggregating Data

In [17]:
# Grouping by subject and calculating the mean score
grouped_scores = df.groupby('Subject')['Score'].mean()

In [18]:
grouped_scores

Unnamed: 0_level_0,Score
Subject,Unnamed: 1_level_1
English,83.0
Math,75.0
Science,86.0


In [19]:
# Aggregating with multiple functions
aggregated = df.groupby('Grade').agg({'Score': ['sum', 'mean'], 'Age': 'mean'})

print(grouped_scores)
print(aggregated)

Subject
English    83.0
Math       75.0
Science    86.0
Name: Score, dtype: float64
      Score              Age
        sum       mean  mean
Grade                       
A       265  88.333333  16.0
B       158  79.000000  15.0
C        65  65.000000  17.0


## 5. Merging and Joining DataFrames

In [20]:
# Creating a second DataFrame to merge with
new_data = {
    'Name': ['Arav', 'Meera', 'Rahul', 'Mohit', 'Sana', 'Priya'],
    'Attendance': [90, 85, 88, 75, 95, 80]
}

attendance_df = pd.DataFrame(new_data)

# Merging the two DataFrames on 'Name'
merged_df = pd.merge(df, attendance_df, on='Name')

print(merged_df)

    Name  Age  Subject Grade  Score  Attendance
0   Arav   16     Math     A     85          90
1  Meera   15  English     B     78          85
2  Rahul   16  Science     A     92          88
3  Mohit   17     Math     C     65          75
4   Sana   15  Science     B     80          95
5  Priya   16  English     A     88          80


## 6. Handling Missing Data

In [21]:
# Adding some missing values to the data
df_with_nan = df.copy()
df_with_nan.loc[1, 'Score'] = None

# Checking for missing data
print(df_with_nan.isnull().sum())

# Filling missing data with the mean score
df_filled = df_with_nan.fillna(df_with_nan['Score'].mean())

print(df_filled)

Name       0
Age        0
Subject    0
Grade      0
Score      1
dtype: int64
    Name  Age  Subject Grade  Score
0   Arav   16     Math     A   85.0
1  Meera   15  English     B   82.0
2  Rahul   16  Science     A   92.0
3  Mohit   17     Math     C   65.0
4   Sana   15  Science     B   80.0
5  Priya   16  English     A   88.0


## 7. Modifying DataFrames

In [22]:
# Adding a new column based on conditions
df['Passed'] = df['Score'] >= 70

# Updating values in the 'Grade' column for students with scores over 90
df.loc[df['Score'] > 90, 'Grade'] = 'A+'

# Dropping the 'Passed' column
df = df.drop('Passed', axis=1)

print(df)

    Name  Age  Subject Grade  Score
0   Arav   16     Math     A     85
1  Meera   15  English     B     78
2  Rahul   16  Science    A+     92
3  Mohit   17     Math     C     65
4   Sana   15  Science     B     80
5  Priya   16  English     A     88


## 5. Merging and Joining DataFrames

In [23]:
# Concatenating DataFrames along rows
concat_df = pd.concat([df1, df2], axis=0)

# Concatenating DataFrames along columns
concat_df = pd.concat([df1, df2], axis=1)

NameError: name 'df1' is not defined