# Pandas Tutorial: Basics for Data Analysis


In [1]:

# Import the Pandas library
import pandas as pd

In [2]:
# 1. Creating DataFrames
# Build the frame column by column: each keyword below becomes a column label
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 35, 40],
    Salary=[50000, 60000, 70000, 80000],
)
df = pd.DataFrame(data=data)
# A single print with sep="\n" emits the heading and the frame on separate lines
print("DataFrame from Dictionary:", df, sep="\n")

DataFrame from Dictionary:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    David   40   80000


In [3]:
# Creating a DataFrame from a list of lists
# Each inner list is one row; `columns` supplies the labels in positional order
columns = ['ID', 'Name', 'Age']
data_list = [
    [1, 'Alice', 25],
    [2, 'Bob', 30],
    [3, 'Charlie', 35],
]
df_list = pd.DataFrame(data=data_list, columns=columns)
print("\nDataFrame from List of Lists:", df_list, sep="\n")



DataFrame from List of Lists:
   ID     Name  Age
0   1    Alice   25
1   2      Bob   30
2   3  Charlie   35


In [4]:

# Writing to a CSV file
# index=False keeps the row index out of the file, so only the data columns are written
df.to_csv(path_or_buf='output.csv', index=False)
print("\nDataFrame saved to 'output.csv'", end="\n")


DataFrame saved to 'output.csv'


In [5]:
# 2. Reading and Writing Data
# Reading a CSV file: 'output.csv' must already exist in the working directory
# (it is the file name passed to df.to_csv in this notebook)
df_csv = pd.read_csv('output.csv')
print("\nDataFrame from CSV:", df_csv, sep="\n")


DataFrame from CSV:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    David   40   80000


In [6]:
# 3. Inspecting Data
# head(2) keeps only the first two rows
first_rows = df.head(2)
print("\nFirst 2 Rows of DataFrame:", first_rows, sep="\n")



First 2 Rows of DataFrame:
    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000


In [7]:
# tail(2) keeps only the last two rows
last_rows = df.tail(2)
print("\nLast 2 Rows of DataFrame:", last_rows, sep="\n")


Last 2 Rows of DataFrame:
      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000


In [8]:

# info() prints its own report and returns None, so it is called bare rather than wrapped in print()
info_heading = "\nDataFrame Info:"
print(info_heading)
df.info()



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Salary  4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


In [9]:
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max)
summary_stats = df.describe()
print("\nDataFrame Summary Statistics:", summary_stats, sep="\n")



DataFrame Summary Statistics:
             Age        Salary
count   4.000000      4.000000
mean   32.500000  65000.000000
std     6.454972  12909.944487
min    25.000000  50000.000000
25%    28.750000  57500.000000
50%    32.500000  65000.000000
75%    36.250000  72500.000000
max    40.000000  80000.000000


In [10]:
# 4. Selecting and Filtering Data
# Selecting a single column: a lone label yields a Series
name_column = df.loc[:, 'Name']
print("\nSelecting 'Name' Column:", name_column, sep="\n")


Selecting 'Name' Column:
0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object


In [11]:
# Selecting multiple columns: a list of labels yields a DataFrame
wanted_columns = ['Name', 'Salary']
print("\nSelecting 'Name' and 'Salary' Columns:", df.loc[:, wanted_columns], sep="\n")


Selecting 'Name' and 'Salary' Columns:
      Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   70000
3    David   80000


In [12]:

# Filtering rows based on a condition
# The boolean mask is True for every row whose Age exceeds 30
older_than_30 = df['Age'].gt(30)
print("\nFiltering Rows where Age > 30:", df.loc[older_than_30], sep="\n")



Filtering Rows where Age > 30:
      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000


In [13]:
# Filtering with multiple conditions
# Two element-wise masks combined with &, so a row must satisfy both to be kept
age_mask = df['Age'].gt(30)
salary_mask = df['Salary'].gt(60000)
print("\nFiltering Rows where Age > 30 and Salary > 60000:")
print(df.loc[age_mask & salary_mask])


Filtering Rows where Age > 30 and Salary > 60000:
      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000


In [14]:
# 5. Adding and Modifying Columns
# Adding a new column: Bonus is 10% of Salary, computed element-wise
df['Bonus'] = df['Salary'].mul(0.1)
print("\nDataFrame with Bonus Column:", df, sep="\n")



DataFrame with Bonus Column:
      Name  Age  Salary   Bonus
0    Alice   25   50000  5000.0
1      Bob   30   60000  6000.0
2  Charlie   35   70000  7000.0
3    David   40   80000  8000.0


In [15]:
# Modifying an existing column: every Salary is raised by 5000
# NOTE: this overwrites the column, so running the cell again adds another 5000
df['Salary'] = df['Salary'].add(5000)
print("\nUpdated Salary Column:", df, sep="\n")



Updated Salary Column:
      Name  Age  Salary   Bonus
0    Alice   25   55000  5000.0
1      Bob   30   65000  6000.0
2  Charlie   35   75000  7000.0
3    David   40   85000  8000.0


In [16]:
# Renaming columns
# rename() returns a new frame; rebinding `df` replaces inplace=True, which has no
# performance benefit and prevents method chaining
df = df.rename(columns={'Name': 'Employee Name', 'Salary': 'Total Salary'})
print("\nRenamed Columns:")
print(df)



Renamed Columns:
  Employee Name  Age  Total Salary   Bonus
0         Alice   25         55000  5000.0
1           Bob   30         65000  6000.0
2       Charlie   35         75000  7000.0
3         David   40         85000  8000.0


In [17]:

# Dropping a column
# drop() returns a new frame without 'Bonus'; rebinding `df` replaces inplace=True,
# which has no performance benefit and prevents method chaining
df = df.drop(columns=['Bonus'])
print("\nDataFrame after Dropping Bonus Column:")
print(df)


DataFrame after Dropping Bonus Column:
  Employee Name  Age  Total Salary
0         Alice   25         55000
1           Bob   30         65000
2       Charlie   35         75000
3         David   40         85000


In [18]:
# 6. Handling Missing Data
# Creating a DataFrame with missing values: None marks one missing entry per column
data_with_nan = dict(
    Name=['Alice', 'Bob', None, 'David'],
    Age=[25, None, 35, 40],
    Salary=[50000, 60000, None, 80000],
)
df_nan = pd.DataFrame(data=data_with_nan)
print("\nDataFrame with Missing Values:", df_nan, sep="\n")



DataFrame with Missing Values:
    Name   Age   Salary
0  Alice  25.0  50000.0
1    Bob   NaN  60000.0
2   None  35.0      NaN
3  David  40.0  80000.0


In [19]:

# Filling missing values
# Per-column replacements: Age gets the mean of its non-missing values, Salary gets 0.
# 'Name' is not listed, so its missing entry is left as is.
age_mean = df_nan['Age'].mean()
df_nan_filled = df_nan.fillna(value={'Age': age_mean, 'Salary': 0})
print("\nFilled Missing Values:", df_nan_filled, sep="\n")



Filled Missing Values:
    Name        Age   Salary
0  Alice  25.000000  50000.0
1    Bob  33.333333  60000.0
2   None  35.000000      0.0
3  David  40.000000  80000.0


In [20]:
# Dropping rows with missing values
# With default arguments dropna() removes every row that has at least one missing entry
df_nan_dropped = df_nan.dropna()
print("\nDropped Rows with Missing Values:", df_nan_dropped, sep="\n")


Dropped Rows with Missing Values:
    Name   Age   Salary
0  Alice  25.0  50000.0
3  David  40.0  80000.0


In [21]:
# Checking for missing values
# isna() flags each missing cell; summing the flags counts them per column
missing_per_column = df_nan.isna().sum()
print("\nMissing Values in DataFrame:", missing_per_column, sep="\n")


Missing Values in DataFrame:
Name      1
Age       1
Salary    1
dtype: int64


In [22]:
# 7. Grouping and Aggregating Data
# Grouping by a column and calculating the mean of 'Total Salary' within each Age group
salary_by_age = df.groupby('Age')['Total Salary']
grouped = salary_by_age.mean()
print("\nGrouping by Age and Calculating Mean Salary:", grouped, sep="\n")


Grouping by Age and Calculating Mean Salary:
Age
25    55000.0
30    65000.0
35    75000.0
40    85000.0
Name: Total Salary, dtype: float64


In [23]:
# Aggregating multiple functions
# Passing a list of function names produces one output column per function
agg_functions = ['mean', 'sum']
aggs = df.groupby('Age')['Total Salary'].agg(agg_functions)
print("\nAggregating Mean and Sum of Total Salary:", aggs, sep="\n")


Aggregating Mean and Sum of Total Salary:
        mean    sum
Age                
25   55000.0  55000
30   65000.0  65000
35   75000.0  75000
40   85000.0  85000


In [24]:

# 8. Sorting Data
# Sorting by a single column (ascending by default); sort_values returns a new frame
sorted_df = df.sort_values('Age')
print("\nSorting by Age:", sorted_df, sep="\n")



Sorting by Age:
  Employee Name  Age  Total Salary
0         Alice   25         55000
1           Bob   30         65000
2       Charlie   35         75000
3         David   40         85000


In [25]:

# Sorting by multiple columns
# Age ascending first; ties on Age are ordered by Total Salary descending
sort_keys = ['Age', 'Total Salary']
sort_ascending = [True, False]
sorted_multi_df = df.sort_values(by=sort_keys, ascending=sort_ascending)
print("\nSorting by Age and Total Salary:", sorted_multi_df, sep="\n")



Sorting by Age and Total Salary:
  Employee Name  Age  Total Salary
0         Alice   25         55000
1           Bob   30         65000
2       Charlie   35         75000
3         David   40         85000


In [26]:

# 9. Merging and Joining DataFrames
# Creating another DataFrame for merging: one department per employee name
departments = dict([
    ('Employee Name', ['Alice', 'Bob', 'Charlie', 'David']),
    ('Department', ['HR', 'Engineering', 'Marketing', 'Finance']),
])
df_dept = pd.DataFrame(data=departments)

In [27]:

# Merging on 'Employee Name'
# The shared key column aligns the rows of the two frames
merged_df = df.merge(df_dept, on='Employee Name')
print("\nMerged DataFrame:", merged_df, sep="\n")



Merged DataFrame:
  Employee Name  Age  Total Salary   Department
0         Alice   25         55000           HR
1           Bob   30         65000  Engineering
2       Charlie   35         75000    Marketing
3         David   40         85000      Finance


In [28]:


# 10. Applying Functions
# Applying a function to a column
def categorize_salary(salary, threshold=60000):
    """Label a salary as 'High' or 'Low' relative to a cutoff.

    Parameters
    ----------
    salary : int or float
        The salary value to classify.
    threshold : int or float, default 60000
        Cutoff value. The default preserves the behaviour of the original
        one-argument version, so `Series.apply(categorize_salary)` still works.

    Returns
    -------
    str
        'High' when ``salary`` is strictly greater than ``threshold``,
        otherwise 'Low' (a salary equal to the threshold is 'Low').
    """
    # Strict comparison: NaN compares False against any number, so it is labelled 'Low'
    return 'High' if salary > threshold else 'Low'

# Run categorize_salary on each 'Total Salary' value and store the labels as a new column
salary_labels = df['Total Salary'].apply(categorize_salary)
df['Salary Category'] = salary_labels
print("\nDataFrame with Salary Category:", df, sep="\n")


DataFrame with Salary Category:
  Employee Name  Age  Total Salary Salary Category
0         Alice   25         55000             Low
1           Bob   30         65000            High
2       Charlie   35         75000            High
3         David   40         85000            High


In [30]:

# 11. Saving and Loading JSON
# Save DataFrame to JSON; orient='records' writes a list with one object per row
df.to_json(path_or_buf='output.json', orient='records')
print("\nDataFrame saved to 'output.json'", end="\n")



DataFrame saved to 'output.json'


In [31]:
# Read DataFrame from JSON: 'output.json' must already exist in the working directory
json_df = pd.read_json('output.json')
print("\nDataFrame read from JSON:", json_df, sep="\n")


DataFrame read from JSON:
  Employee Name  Age  Total Salary Salary Category
0         Alice   25         55000             Low
1           Bob   30         65000            High
2       Charlie   35         75000            High
3         David   40         85000            High


In [32]:


# 12. Additional DataFrame Operations
# Resetting the index: drop=True discards the old index instead of keeping it as a column
reset_df = df.reset_index(drop=True)
print("\nDataFrame after Resetting Index:", reset_df, sep="\n")



DataFrame after Resetting Index:
  Employee Name  Age  Total Salary Salary Category
0         Alice   25         55000             Low
1           Bob   30         65000            High
2       Charlie   35         75000            High
3         David   40         85000            High


In [33]:

# Setting a column as the index
# set_index returns a new frame whose row labels are the 'Employee Name' values
indexed_df = df.set_index(keys='Employee Name')
print("\nDataFrame with 'Employee Name' as Index:", indexed_df, sep="\n")



DataFrame with 'Employee Name' as Index:
               Age  Total Salary Salary Category
Employee Name                                   
Alice           25         55000             Low
Bob             30         65000            High
Charlie         35         75000            High
David           40         85000            High


In [34]:

# Dropping rows by index
# index=[...] names the row labels to remove; the result is a new frame
rows_to_remove = ['Alice']
indexed_df_dropped = indexed_df.drop(index=rows_to_remove)
print("\nDataFrame after Dropping 'Alice':", indexed_df_dropped, sep="\n")



DataFrame after Dropping 'Alice':
               Age  Total Salary Salary Category
Employee Name                                   
Bob             30         65000            High
Charlie         35         75000            High
David           40         85000            High
