#### Q1. List any five functions of the pandas library with execution.

##### 1. read_csv() - used to read CSV files into a DataFrame

In [4]:
import pandas as pd

# Reading a CSV file into a DataFrame
df = pd.read_csv('data.csv')

# Displaying the DataFrame
print(df.head())


                                   Youtube Urls  \
0  https://www.youtube.com//watch?v=nX5ONgCdLcc   
1  https://www.youtube.com//watch?v=AM2Dt7cNebw   
2  https://www.youtube.com//watch?v=7nMJVhey9TM   
3  https://www.youtube.com//watch?v=FSVVlcFUCMk   
4  https://www.youtube.com//watch?v=vKxdTuOirnI   

                                          Thumbnails  \
0  https://i.ytimg.com/vi/nX5ONgCdLcc/maxresdefau...   
1  https://i.ytimg.com/vi/AM2Dt7cNebw/maxresdefau...   
2  https://i.ytimg.com/vi/7nMJVhey9TM/maxresdefau...   
3  https://i.ytimg.com/vi/FSVVlcFUCMk/maxresdefau...   
4  https://i.ytimg.com/vi/vKxdTuOirnI/maxresdefau...   

                                              Titles       Views Upload Time  
0             How to Attempt English Board Exam ????  122K views   1 day ago  
1  Best technique to attempt SST paper in Board e...   54K views  2 days ago  
2  Last Minute Strategy To Score More Than 98% ||...   22K views  3 days ago  
3  Why You Should Choose Commerce After 

##### 2. dropna() - used to remove rows with missing values from a DataFrame

In [5]:
import pandas as pd

# Creating a DataFrame with missing values
df = pd.DataFrame({'A': [1, 2, None, 4], 'B': [5, None, None, 8]})

# Dropping rows with missing values
df = df.dropna()

# Displaying the DataFrame
print(df)


     A    B
0  1.0  5.0
3  4.0  8.0


##### 3. groupby() - used to group data in a DataFrame by one or more columns

In [6]:
import pandas as pd

# Creating a DataFrame
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie', 'David'],
                   'Age': [25, 30, 27, 28],
                   'Gender': ['Female', 'Male', 'Male', 'Male']})

# Grouping the data by gender and computing the mean age
grouped = df.groupby('Gender')['Age'].mean()

# Displaying the grouped data
print(grouped)


Gender
Female    25.000000
Male      28.333333
Name: Age, dtype: float64


##### 4. merge() - used to merge two or more DataFrames based on a common column

In [7]:
import pandas as pd

# Creating two DataFrames to merge
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
df2 = pd.DataFrame({'ID': [1, 2, 4], 'Age': [25, 30, 27]})

# Merging the two DataFrames based on the 'ID' column
merged = pd.merge(df1, df2, on='ID')

# Displaying the merged DataFrame
print(merged)


   ID   Name  Age
0   1  Alice   25
1   2    Bob   30


##### 5. value_counts() - used to count the number of occurrences of unique values in a Series

In [8]:
import pandas as pd

# Creating a Series
s = pd.Series(['A', 'B', 'C', 'A', 'A', 'B'])

# Counting the number of occurrences of each unique value
counts = s.value_counts()

# Displaying the counts
print(counts)


A    3
B    2
C    1
dtype: int64


#### Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [10]:
import pandas as pd

def reindex_df(df):
    # Creating a custom index array
    new_index = pd.RangeIndex(start=1, stop=len(df)*2, step=2)

    # Re-indexing the DataFrame with the custom index
    df = df.set_index(new_index)

    return df


In [11]:
# Creating a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Re-indexing the DataFrame with a custom index
new_df = reindex_df(df)

# Displaying the new DataFrame
print(new_df)


   A  B  C
1  1  4  7
3  2  5  8
5  3  6  9


#### Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.

#### For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should calculate and print the sum of the first three values, which is 60.

In [12]:
import pandas as pd

def sum_first_three_values(df):
    # Selecting the first three values in the 'Values' column
    first_three_values = df['Values'].iloc[:3]

    # Calculating the sum of the first three values
    sum_first_three = first_three_values.sum()

    # Printing the sum to the console
    print("The sum of the first three values in the 'Values' column is:", sum_first_three)


In [14]:
# Creating a sample DataFrame
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})

# Calculating the sum of the first three values in the 'Values' column
sum_first_three_values(df)


The sum of the first three values in the 'Values' column is: 60


#### Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [16]:
import pandas as pd

def add_word_count(df):
    # Splitting each row of the 'Text' column into words and counting their length
    word_count = df['Text'].apply(lambda x: len(x.split()))

    # Adding the 'Word_Count' column to the DataFrame
    df['Word_Count'] = word_count

    # Returning the updated DataFrame
    return df


In [17]:
# Creating a sample DataFrame
df = pd.DataFrame({'Text': ['This is a sample sentence', 'Another sentence', 'A third sentence with more words']})

# Adding the 'Word_Count' column to the DataFrame
df = add_word_count(df)

# Printing the updated DataFrame
print(df)


                               Text  Word_Count
0         This is a sample sentence           5
1                  Another sentence           2
2  A third sentence with more words           6


#### Q5. How are DataFrame.size() and DataFrame.shape() different?

Ans -
DataFrame.size() returns the total number of elements in a DataFrame, i.e., the product of the number of rows and columns. It returns a scalar value that represents the size of the DataFrame.

DataFrame.shape() returns a tuple of two elements that represent the dimensions of the DataFrame, i.e., the number of rows and columns. The first element of the tuple represents the number of rows, and the second element represents the number of columns.

In [18]:
# Creating a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Printing the size of the DataFrame
print(df.size)

# Printing the shape of the DataFrame
print(df.shape)


9
(3, 3)


#### Q6. Which function of pandas do we use to read an excel file?

In [None]:
# Reading data from an Excel file into a DataFrame
df = pd.read_excel('file.xlsx')

# Displaying the DataFrame
print(df)


#### Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.

In [19]:
def extract_username(df):
    # Splitting the 'Email' column into separate columns at the '@' symbol
    split_df = df['Email'].str.split('@', expand=True)
    
    # Selecting the first column (the username) and renaming it to 'Username'
    df['Username'] = split_df[0]
    
    return df


In [22]:
# Creating a sample DataFrame
df = pd.DataFrame({
    'Email': ['john.doe@example.com', 'bob@example.com', 'claire@example.com']
})

extract_username(df)


Unnamed: 0,Email,Username
0,john.doe@example.com,john.doe
1,bob@example.com,bob
2,claire@example.com,claire


#### Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The function should return a new DataFrame that contains only the selected rows. For example, if df contains the following values:
####    A B C
#### 0 3 5 1
#### 1 8 2 7
#### 2 6 9 4
#### 3 2 3 5
#### 4 9 1 2

In [23]:
import pandas as pd

# Creating a sample DataFrame
df = pd.DataFrame({
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
})

def select_rows(df):
    # Selecting rows where A > 5 and B < 10
    selected_rows = df[(df['A'] > 5) & (df['B'] < 10)]
    
    # Returning the selected rows
    return selected_rows

# Testing the function
selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


#### Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [24]:
import pandas as pd

def calculate_stats(df):
    mean_value = df['Values'].mean()
    median_value = df['Values'].median()
    std_value = df['Values'].std()
    print('Mean:', mean_value)
    print('Median:', median_value)
    print('Standard Deviation:', std_value)


In [25]:
# create example DataFrame
df = pd.DataFrame({'Values': [1, 2, 3, 4, 5]})
calculate_stats(df)


Mean: 3.0
Median: 3.0
Standard Deviation: 1.5811388300841898


#### Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days B for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [26]:
import pandas as pd

def calculate_moving_average(df):
    # sort DataFrame by date
    df = df.sort_values('Date')
    # create a new column with the rolling mean of the sales for the past 7 days
    df['MovingAverage'] = df['Sales'].rolling(window=7, min_periods=1).mean()
    return df


In [27]:
# create example DataFrame
df = pd.DataFrame({'Sales': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55],
                   'Date': pd.date_range(start='2022-01-01', periods=10)})
# calculate moving average
df = calculate_moving_average(df)
print(df)


   Sales       Date  MovingAverage
0     10 2022-01-01           10.0
1     15 2022-01-02           12.5
2     20 2022-01-03           15.0
3     25 2022-01-04           17.5
4     30 2022-01-05           20.0
5     35 2022-01-06           22.5
6     40 2022-01-07           25.0
7     45 2022-01-08           30.0
8     50 2022-01-09           35.0
9     55 2022-01-10           40.0


#### Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g. Monday, Tuesday) corresponding to each date in the 'Date' column. For example, if df contains the following values: Date
#### 0 2023-01-01
#### 1 2023-01-02
#### 2 2023-01-03
#### 3 2023-01-04
#### 4 2023-01-05

In [33]:
import pandas as pd

def add_weekday_column(df):
    # Convert 'Date' column to datetime type
    df['Date'] = pd.to_datetime(df['Date'])

    # Extract weekday name from 'Date' column
    df['Weekday'] = df['Date'].dt.day_name()

    # Return modified DataFrame
    return df


In [35]:
# create DataFrame
df = pd.DataFrame({'Date':[" 2023-01-01","2023-01-02","2023-01-03","2023-01-04","2023-01-05"]})
# calculate moving average
df = add_weekday_column(df)
print(df)

        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


#### Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [36]:
import pandas as pd

def select_january_rows(df):
    start_date = '2023-01-01'
    end_date = '2023-01-31'
    mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    return df.loc[mask]


In [37]:
# create DataFrame
df = pd.DataFrame({'Date':[" 2023-01-01","2023-01-02","2023-01-03","2023-01-04","2023-01-05","2023-02-05","2023-02-04","2023-03-05"]})
# calculate moving average
df = select_january_rows(df)
print(df)

         Date
1  2023-01-02
2  2023-01-03
3  2023-01-04
4  2023-01-05


#### Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

Ans - The first and foremost necessary library that needs to be imported to use the basic functions of pandas is pandas itself.

In [38]:
import pandas as pd