In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana", "Evan"],
    "age": [25, 30, None, 22, 40],
    "city": ["NY", "LA", "NY", "SF", "LA"],
    "height_m": [1.65, 1.80, 1.75, 1.60, 1.90],
    "weight_kg": [55, 82, 77, 60, 90],
    "salary": [72000, 85000, 56000, 63000, 120000]
})

In [3]:
df.loc[:,'age']
# df.iloc[0]

0    25.0
1    30.0
2     NaN
3    22.0
4    40.0
Name: age, dtype: float64

In [4]:
df["newColumn"] = df["age"]/2

In [5]:
# df.groupby('newColumn')
newPrint = df.groupby('newColumn')
print(newPrint)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B43F76C440>


In [6]:
df["newColumn"]

0    12.5
1    15.0
2     NaN
3    11.0
4    20.0
Name: newColumn, dtype: float64

In [7]:
print(df.head())
print(df.info())
print(df.describe())

      name   age city  height_m  weight_kg  salary  newColumn
0    Alice  25.0   NY      1.65         55   72000       12.5
1      Bob  30.0   LA      1.80         82   85000       15.0
2  Charlie   NaN   NY      1.75         77   56000        NaN
3    Diana  22.0   SF      1.60         60   63000       11.0
4     Evan  40.0   LA      1.90         90  120000       20.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       5 non-null      object 
 1   age        4 non-null      float64
 2   city       5 non-null      object 
 3   height_m   5 non-null      float64
 4   weight_kg  5 non-null      int64  
 5   salary     5 non-null      int64  
 6   newColumn  4 non-null      float64
dtypes: float64(3), int64(2), object(2)
memory usage: 412.0+ bytes
None
             age  height_m  weight_kg         salary  newColumn
count   4.000000  5.000000   5.0

In [8]:
df = df.drop_duplicates()
df["age"] = df["age"].fillna(df["age"].median())

In [9]:
def age_category(age):
    if pd.isna(age):
        return "unknown"
    elif age < 25:
        return "young"
    elif age < 35:
        return "adult"
    else:
        return "senior"

df["age_category"] = df["age"].apply(age_category)


def bmi_category(weight, height):
    bmi = weight / (height ** 2)
    if bmi < 18.5:
        return "underweight"
    elif bmi < 25:
        return "normal"
    elif bmi < 30:
        return "overweight"
    else:
        return "obese"

In [10]:
df["bmi"] = df["weight_kg"] / df["height_m"]**2


In [11]:
adults = df[df["age"] >= 30]
subset = df[["name", "city", "salary"]]

In [12]:
avg_salary = df.groupby("city")["salary"].mean().reset_index()


In [13]:
top_salaries = df.sort_values("salary", ascending=False)


In [14]:
dept = pd.DataFrame({
    "name": ["Alice", "Bob", "Charlie", "Diana", "Evan"],
    "department": ["HR", "IT", "Finance", "Marketing", "IT"]
})

In [15]:
import pandas as pd

names = ["Alice", "Bob", "Charlie", "Dana"]
salary = [70000, 85000, 65000, 90000]
bonus = [5000, 8000, 3000, 10000]

df = pd.DataFrame({
    "name": names,
    "salary": salary,
    "bonus": bonus,
})

df["total_comp"] = df["salary"] + df["bonus"]

high_comp = df[df["total_comp"] > 80_000]

print(df)
print("\nTotal comp > 80k:\n", high_comp)

      name  salary  bonus  total_comp
0    Alice   70000   5000       75000
1      Bob   85000   8000       93000
2  Charlie   65000   3000       68000
3     Dana   90000  10000      100000

Total comp > 80k:
    name  salary  bonus  total_comp
1   Bob   85000   8000       93000
3  Dana   90000  10000      100000


In [16]:
import pandas as pd

In [17]:
import pandas as pd
df = pd.read_csv("employees.csv", encoding = 'ISO-8859-1')

In [18]:
print(df[['name','department','salary']].head())


          name    department  salary
0  Alice Zhang  Data Science  120000
1    Bob Smith   Engineering   95000
2    Carol Lee  Data Science  105000
3    David Kim       Product   88000
4   Eva Müller     Marketing   72000


In [19]:
print(df[df['salary'] > 90000])


    employee_id           name    department  salary  years_experience  office
0             1    Alice Zhang  Data Science  120000               4.5      NY
1             2      Bob Smith   Engineering   95000               3.0      SF
2             3      Carol Lee  Data Science  105000               5.0  Remote
5             6  Frank Johnson  Data Science   98000               2.0      SF
6             7     Grace Park   Engineering  110000               6.0      NY
9            10    Jack Wilson       Product  102000               5.5      SF
12           13    Maria Lopez   Engineering   97000               4.0  Remote
13           14   Nathan Clark       Finance  115000               7.0      NY
14           15   Olivia Perez  Data Science  132000               8.0      SF
16           17   Quinn Taylor       Product   91000               3.0  London


In [20]:
print(df[(df['department'] == 'Data Science') & ( df['years_experience'] >= 3)])


    employee_id          name    department  salary  years_experience  office
0             1   Alice Zhang  Data Science  120000               4.5      NY
2             3     Carol Lee  Data Science  105000               5.0  Remote
14           15  Olivia Perez  Data Science  132000               8.0      SF


In [21]:
print(df.groupby('department')['salary'].mean())


department
Data Science    103166.666667
Engineering      96250.000000
Finance          96500.000000
HR               65000.000000
Marketing        70500.000000
Product          93666.666667
Name: salary, dtype: float64


In [22]:
print(df.groupby('office').count().iloc[:,0])


office
Berlin    3
London    3
NY        4
Remote    4
SF        4
Name: employee_id, dtype: int64


In [23]:
print(df.sort_values('salary',ascending=False)[:5]['name'])


14    Olivia Perez
0      Alice Zhang
13    Nathan Clark
6       Grace Park
2        Carol Lee
Name: name, dtype: object


In [24]:
print(df.groupby('department')['salary'].nlargest(3).rename(df['name']))


department                
Data Science  Olivia Perez    132000
              Alice Zhang     120000
              Carol Lee       105000
Engineering   Grace Park      110000
              Maria Lopez      97000
              Bob Smith        95000
Finance       Nathan Clark    115000
              Henry Brown      78000
HR            Irene García     65000
Marketing     Eva Müller       72000
              Karen Davis      69000
Product       Jack Wilson     102000
              Quinn Taylor     91000
              David Kim        88000
Name: salary, dtype: int64


In [25]:
df['salary_per_year_exp'] = df['salary'] / df['years_experience'].replace(0,1)


In [26]:
print(df.head())

   employee_id         name    department  salary  years_experience  office  \
0            1  Alice Zhang  Data Science  120000               4.5      NY   
1            2    Bob Smith   Engineering   95000               3.0      SF   
2            3    Carol Lee  Data Science  105000               5.0  Remote   
3            4    David Kim       Product   88000               4.0      NY   
4            5   Eva Müller     Marketing   72000               2.5  Berlin   

   salary_per_year_exp  
0         26666.666667  
1         31666.666667  
2         21000.000000  
3         22000.000000  
4         28800.000000  


In [27]:
print(df[['name','department','salary']].head())


          name    department  salary
0  Alice Zhang  Data Science  120000
1    Bob Smith   Engineering   95000
2    Carol Lee  Data Science  105000
3    David Kim       Product   88000
4   Eva Müller     Marketing   72000


In [None]:
df.drop_duplicates()
df.drop(columns="thisColumn")
df["last_name"].str.strip()
df["last_name"]= df["last_name"].str.rstrip() ##or left strip
df["Phone_Number"].str.replace('[^a-zA-zo-9]','')

##lambda x: str(x)
df["Phone_Number"].apply(lamda x: x[0:3] + '-' + x[3:6] + '-' + x[6:10])

for x in df.index:
    if df.loc[x, "Do_Not_Contact"] == 'Y':
        df.drop(x, inplace = True)