In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder


In [2]:
# Load the dataset.
# The absolute /content path only exists on the machine the notebook was
# written for (it looks like a Colab working directory — assumption), and
# reading it elsewhere raises FileNotFoundError. Try it first, then fall back
# to a file of the same name in the current working directory.
DATASET_PATH = "/content/sample_dataset_with_nulls.csv"
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    # Same file name, resolved relative to the working directory; if this is
    # missing too, the FileNotFoundError propagates as before.
    df = pd.read_csv(DATASET_PATH.rsplit("/", 1)[-1])

FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_dataset_with_nulls.csv'

In [None]:
# 1. Get the number of rows, columns, datatype, and summary stats of each column
#    of a dataframe. Also, get the array and list equivalent.
# df.shape is a (rows, columns) pair, so unpack it once and report both parts.
n_rows, n_cols = df.shape
print("Number of Rows:", n_rows)
print("Number of Columns:", n_cols)


In [None]:
# Per-column dtypes first, then the describe() summary statistics.
column_types = df.dtypes
print("Data Types:\n", column_types)
summary_stats = df.describe()
print("Summary Stats:\n", summary_stats)


In [None]:
# The frame's contents as a 2-D NumPy array (one row per record).
array_equivalent = df.values
print("Array Equivalent:\n", array_equivalent)


In [None]:
# The same contents as a nested Python list: one inner list per row.
list_equivalent = df.values.tolist()
print("List Equivalent:\n", list_equivalent)

In [None]:
# 2. Extract row and column number of a particular cell based on a criterion
# NOTE(review): 'Age' is averaged and scaled as a numeric column later in this
# notebook, so comparing it with the string 'Ages' presumably never matches and
# row_positions comes back empty — confirm the intended value.
criterion = df['Age'] == 'Ages'
# np.flatnonzero gives 0-based positional row numbers. df.index[criterion]
# would give index labels, which equal positions only for a default RangeIndex.
row_positions = np.flatnonzero(criterion.to_numpy()).tolist()
# get_loc returns the 0-based position of the 'Age' column.
col_positions = [df.columns.get_loc('Age')]
print("Row Positions:", row_positions)
print("Column Positions:", col_positions)

In [None]:
# 3. Rename a specific column
# Map old label -> new label; columns not listed keep their names.
column_renames = {'Name': 'Person_Name'}
df = df.rename(columns=column_renames)

In [None]:
# Preview the first rows to confirm the column rename.
df.head()

In [None]:
# 4. Count missing values per column
# isna() flags each missing cell; summing the flags column-wise gives the counts.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# 5. Replace missing values in multiple numeric columns with mean
# Cover every numeric column rather than 'Age' alone, as the task asks.
# Passing the Series of column means to fillna fills each column's gaps with
# that column's own mean.
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [None]:
# Preview the first rows after the mean fill.
df.head()

In [None]:
# 6. Replace a missing value using the Imputer class
# Restrict the imputer to numeric columns; the 'mean' strategy fills each
# column's gaps with that column's mean.
numeric_cols = df.select_dtypes(include=['number']).columns
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum()

In [None]:
# Preview the first rows after imputation.
df.head()

In [None]:
# 7. Apply function to existing columns with global variables as arguments
# Module-level value handed to custom_function as its multiplier argument.
global_var = 10
def custom_function(x, multiplier):
    """Return ``x`` multiplied by ``multiplier``.

    Args:
        x: Value to scale; anything supporting ``*`` with ``multiplier``.
        multiplier: Factor applied to ``x``.

    Returns:
        The product ``x * multiplier``.
    """
    scaled = x * multiplier
    return scaled
# Scale every Salary by global_var and store the result in a new column.
df['Updated_Salary'] = df['Salary'].apply(lambda salary: custom_function(salary, global_var))
df.head()

In [None]:
# 8. Change column order while keeping all columns sorted
# Every column is kept; only the left-to-right order changes.
alphabetical_columns = sorted(df.columns)
df = df.loc[:, alphabetical_columns]
df.head()

In [None]:
# 9. Set number of rows & columns displayed
# Attribute-style access to the same display options set_option would change.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 10
df.head()

In [None]:

# 10. Create primary key index by combining columns
# Key format is '<Person_Name>_<Age>'. Both parts are cast to str so they
# concatenate; set_index then moves the helper column into the index.
name_part = df['Person_Name'].astype(str)
age_part = df['Age'].astype(str)
df['primary_key'] = name_part + '_' + age_part
df = df.set_index('primary_key')
df.head()

In [None]:
# 11. Get row number of nth largest value in a column
def nth_largest_row_number(values, nth):
    """Return the 0-based row position of the nth largest entry in ``values``.

    Args:
        values: pandas Series to rank.
        nth: 1-based rank (1 is the largest value).

    Returns:
        The integer row position, or None when ``nth`` is below 1 or the
        Series holds fewer than ``nth`` rankable values.
    """
    if nth < 1:
        return None
    # Dropping the index makes nlargest report positions. The frame's own
    # index holds 'primary_key' labels at this point, not row numbers.
    top = values.reset_index(drop=True).nlargest(nth)
    if len(top) < nth:
        # Not enough values: there is no nth largest to report.
        return None
    return int(top.index[-1])


n = 5
m = 3
row_num = nth_largest_row_number(df['Salary'], n)
print("Row Number of nth largest value:", row_num)

row_num = nth_largest_row_number(df['Salary'], m)
print("Row Number of mth largest value:", row_num)

In [None]:
# 12. Find position of nth largest value greater than a given value
given_value = 50
# Rows whose Salary exceeds the threshold (kept for inspection).
filtered_df = df[df['Salary'] > given_value]
# Rank on a position-indexed copy of Salary so the answer is a 0-based row
# number within df rather than an index label.
salary_by_position = df['Salary'].reset_index(drop=True)
top_above = salary_by_position[salary_by_position > given_value].nlargest(n)
# With fewer than n qualifying rows there is no nth largest value; report None
# instead of returning a lower rank or raising IndexError on an empty result.
nth_largest_row = int(top_above.index[-1]) if 1 <= n <= len(top_above) else None
print("Position of nth largest value > given value:", nth_largest_row)

In [None]:
# 13. Get last n rows where row sum > 100
n = 5  # Change as needed
# Row totals are taken over the numeric columns only.
numeric_row_totals = df.select_dtypes(include=[np.number]).sum(axis=1)
exceeds_threshold = numeric_row_totals > 100
filtered_rows = df.loc[exceeds_threshold].tail(n)
print(filtered_rows)

In [None]:
# 14. Create a column with min/max of each row (only for numeric columns)
# Exclude a previously computed 'min_max_ratio' so re-running this cell does
# not feed the old ratio back into the row-wise min and max.
numeric_df = df.select_dtypes(include=[np.number]).drop(columns=['min_max_ratio'], errors='ignore')
# A row whose max is 0 divides by zero and yields inf/NaN; pandas does not raise.
df['min_max_ratio'] = numeric_df.min(axis=1) / numeric_df.max(axis=1)

df.head()

In [None]:
# 15. Encode a categorical column (label encoding and one-hot encoding)
# The column named here must be present in df.
categorical_col = 'Department'

# Label encoding: fit on the column, then map each value to its integer code.
label_encoder = LabelEncoder()
label_encoder.fit(df[categorical_col])
df['encoded_col'] = label_encoder.transform(df[categorical_col])

# One-hot encoding: one 'category_<value>' indicator column per distinct
# value, appended to the right of the existing columns.
one_hot = pd.get_dummies(df[categorical_col], prefix='category')
df = pd.concat([df, one_hot], axis=1)


In [None]:
# 16. Normalize columns using MinMaxScaler
# The listed columns are rescaled in place; other columns are untouched.
minmax_columns = ['Age', 'Experience']
scaler = MinMaxScaler()
df[minmax_columns] = scaler.fit_transform(df[minmax_columns])


In [None]:
# Preview the first rows after min-max scaling.
df.head()

In [None]:
# 17. Normalize columns using Z-score normalization
# StandardScaler centres each listed column on its mean and divides by its
# standard deviation. Fit first, then transform the same columns.
zscore_columns = ['Experience', 'ID']
scaler = StandardScaler()
scaler.fit(df[zscore_columns])
df[zscore_columns] = scaler.transform(df[zscore_columns])

In [None]:
# List the frame's column labels as they stand after encoding and scaling.
print(df.columns)



In [None]:
# 20. Sort values by a specific column in descending order
# Largest 'Experience' first; rebinding df replaces the in-place sort.
df = df.sort_values(by='Experience', ascending=False)

print("Operations completed successfully!")
df.head()

In [None]:
# 18. Import data from URL
# read_html returns a list of DataFrames parsed from the page; keep the first.
# NOTE(review): plain-http address — confirm the page is still served here.
tables_on_page = pd.read_html("http://www.fdic.gov/bank/individual/failed/banklist.html")
df_url = tables_on_page[0]
print(df_url.head())

In [None]:
# List the column labels of the table read from the URL.
print(df_url.columns)

In [None]:
# Question 20 applied to the data from question 18: sort values by a specific
# column in descending order.
df_url = df_url.sort_values(by='Bank Name', ascending=False)

print("Operations completed successfully!")


In [None]:
# Summarise the 'Bank Name' column: distinct-value count, then per-value frequency.
unique_bank_names = df_url['Bank Name'].nunique()  # Count of unique values
print("Unique value count in 'Bank Name':", unique_bank_names)
bank_name_frequencies = df_url['Bank Name'].value_counts()  # Frequency of each unique value
print(bank_name_frequencies)
