<a href="https://colab.research.google.com/github/rohitrath0d/dataScience/blob/main/DataFrames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings as w

import pandas as pd  # To create dataset
from scipy.stats import mstats
w.filterwarnings('ignore')

# Creating the dataset (Note: Your original dataset creation has some syntax issues)
data = {
    'Job Position': ['CEO', 'Senior Manager', 'Junior Manager', 'Employee', 'Assistant Staff'],
    'Years of Experience': [5, 4, 3, None, 1],
    'Salary': [100000, 80000, None, 40000, 20000]
}

# Create DataFrame
df = pd.DataFrame(data)

# View the dataset
print("Original DataFrame:")
print(df)

Original DataFrame:
      Job Position  Years of Experience    Salary
0              CEO                  5.0  100000.0
1   Senior Manager                  4.0   80000.0
2   Junior Manager                  3.0       NaN
3         Employee                  NaN   40000.0
4  Assistant Staff                  1.0   20000.0


In [None]:
# 2. Basic data pre-processing tasks: Handling missing values and outliers

# Handling missing values
df['Years of Experience'].fillna(df['Years of Experience'].median(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df

Unnamed: 0,Job Position,Years of Experience,Salary
0,CEO,5.0,100000.0
1,Senior Manager,4.0,80000.0
2,Junior Manager,3.0,60000.0
3,Employee,3.5,40000.0
4,Assistant Staff,1.0,20000.0


In [None]:
# Handling outliers in the 'Salary' column using winsorizing
# `limits` gives the fraction of values to clip at the (low, high) ends.
# With the default inclusive bounds, winsorize clips int(fraction * n) values
# per tail, so on this 5-row frame a 5% limit clips int(0.25) == 0 values and
# the column comes back unchanged; the limits only bite on larger datasets.
# `mstats` is imported with the other dependencies in the first cell.
df['Salary'] = mstats.winsorize(df['Salary'], limits=[0.05, 0.05])
df['Salary']

Unnamed: 0,Salary
0,100000.0
1,80000.0
2,60000.0
3,40000.0
4,20000.0


In [None]:

# 3. Data Manipulation and Transformation: Filtering, Sorting, Grouping

# Filtering: keep only the rows whose 'Years of Experience' is present
df_filtered = df.loc[df['Years of Experience'].notna()]

# Sorting: order the rows from the highest 'Salary' to the lowest
df_sorted = df.sort_values('Salary', ascending=False)

# Grouping: mean salary per job position, with the position kept as a
# regular column instead of becoming the index
df_grouped = df.groupby('Job Position', as_index=False)['Salary'].mean()

# Viewing the results: print each table under its own heading
for heading, table in (
    ("\nFiltered DataFrame (Rows with non-null 'Years of Experience'):", df_filtered),
    ("\nSorted DataFrame (Descending order by 'Salary'):", df_sorted),
    ("\nGrouped DataFrame (Average salary for each job position):", df_grouped),
):
    print(heading)
    print(table)



Filtered DataFrame (Rows with non-null 'Years of Experience'):
      Job Position  Years of Experience    Salary
0              CEO                  5.0  100000.0
1   Senior Manager                  4.0   80000.0
2   Junior Manager                  3.0   60000.0
3         Employee                  3.5   40000.0
4  Assistant Staff                  1.0   20000.0

Sorted DataFrame (Descending order by 'Salary'):
      Job Position  Years of Experience    Salary
0              CEO                  5.0  100000.0
1   Senior Manager                  4.0   80000.0
2   Junior Manager                  3.0   60000.0
3         Employee                  3.5   40000.0
4  Assistant Staff                  1.0   20000.0

Grouped DataFrame (Average salary for each job position):
      Job Position    Salary
0  Assistant Staff   20000.0
1              CEO  100000.0
2         Employee   40000.0
3   Junior Manager   60000.0
4   Senior Manager   80000.0
