Wrong Format

In [1]:
'''
1. to_datetime()
   - Converts a string or other type to datetime.
   - Usage:
        pd.to_datetime(df['date_column'])

        
2. to_numeric()
   - Converts argument to a numeric type (integer or float).
   - Usage:
        pd.to_numeric(df['numeric_column'])

        
3. astype()
   - Casts a pandas object (Series or DataFrame) to a specified type.
   - Usage:
        df['column'] = df['column'].astype(int)

        
4. to_timedelta()
   - Converts argument to timedelta type.
   - Usage:
        pd.to_timedelta(df['time_column'])
 
    td = pd.to_timedelta("2 days 5 hours 30 minutes")
    print(td)  # 2 days 05:30:00
    

5. convert_dtypes()
   - Automatically converts columns to the best possible dtypes.
   - Usage:
        df.convert_dtypes()

6. pd.Categorical()
   - Converts a list-like object to a categorical type (useful for reducing memory usage).
   - Usage:
        df['category_column'] = pd.Categorical(df['category_column'])

     
7. apply(pd.to_numeric)
   - Converts columns to numeric by applying to_numeric across DataFrame columns.
   - Usage:
        df = df.apply(pd.to_numeric, errors='coerce')

        
8. str.strip(), str.lower(), str.upper() (for String Columns)
   - Used to clean or standardize string columns.
   - Usage:
        df['string_column'] = df['string_column'].str.strip().str.lower()

        
9. map()
   - Used to convert or map elements in a column.
   - Usage:
        df['mapped_column'] = df['column'].map({'old_value': 'new_value'})

'''

'\n1. to_datetime()\n   - Converts a string or other type to datetime.\n   - Usage:\n        pd.to_datetime(df[\'date_column\'])\n\n\n2. to_numeric()\n   - Converts argument to a numeric type (integer or float).\n   - Usage:\n        pd.to_numeric(df[\'numeric_column\'])\n\n\n3. astype()\n   - Casts a pandas object (Series or DataFrame) to a specified type.\n   - Usage:\n        df[\'column\'] = df[\'column\'].astype(int)\n\n\n4. to_timedelta()\n   - Converts argument to timedelta type.\n   - Usage:\n        pd.to_timedelta(df[\'time_column\'])\n\n    td = pd.to_timedelta("2 days 5 hours 30 minutes")\n    print(td)  # 2 days 05:30:00\n\n\n5. convert_dtypes()\n   - Automatically converts columns to the best possible dtypes.\n   - Usage:\n        df.convert_dtypes()\n\n6. pd.Categorical()\n   - Converts a list-like object to a categorical type (useful for reducing memory usage).\n   - Usage:\n        df[\'category_column\'] = pd.Categorical(df[\'category_column\'])\n\n\n7. apply(pd.to_nu

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve", None, "Alice"],
    "Age": [25, 30, 35, 40, np.nan, 28, 25],
    "Salary": [50000, 60000, 70000, 80000, 90000, 55000, 50000],
    "JoinDate": ["2022-01-10", "2021-05-15", None, "2023-07-20", "2020-03-01", "2023-01-01", "2022-01-10"],
    "Department": ["HR", "Finance", "IT", "Marketing", None, "HR", "HR"],
    "Bonus": ["5%", "10%", "15%", None, "20%", "5%", "5%"],
}

print('--------original-df--------')
df = pd.DataFrame(data)
print(df)

--------original-df--------
      Name   Age  Salary    JoinDate Department Bonus
0    Alice  25.0   50000  2022-01-10         HR    5%
1      Bob  30.0   60000  2021-05-15    Finance   10%
2  Charlie  35.0   70000        None         IT   15%
3    David  40.0   80000  2023-07-20  Marketing  None
4      Eve   NaN   90000  2020-03-01       None   20%
5     None  28.0   55000  2023-01-01         HR    5%
6    Alice  25.0   50000  2022-01-10         HR    5%


In [4]:
print(df['JoinDate'].dtype)

df['JoinDate'] = pd.to_datetime(df['JoinDate'])  # changing datatype
print(df['JoinDate'].dtype)

object
datetime64[ns]


In [5]:
df.dtypes

Name                  object
Age                  float64
Salary                 int64
JoinDate      datetime64[ns]
Department            object
Bonus                 object
dtype: object

In [6]:
df['Age'].fillna(-20,inplace=True)
print(df)

print("--------Removing Negative Values--------")
for i in df.index:
    if df.loc[i,'Age'] < 0:
        df.loc[i,'Age'] = 0
print(df)

      Name   Age  Salary   JoinDate Department Bonus
0    Alice  25.0   50000 2022-01-10         HR    5%
1      Bob  30.0   60000 2021-05-15    Finance   10%
2  Charlie  35.0   70000        NaT         IT   15%
3    David  40.0   80000 2023-07-20  Marketing  None
4      Eve -20.0   90000 2020-03-01       None   20%
5     None  28.0   55000 2023-01-01         HR    5%
6    Alice  25.0   50000 2022-01-10         HR    5%
--------Removing Negative Values--------
      Name   Age  Salary   JoinDate Department Bonus
0    Alice  25.0   50000 2022-01-10         HR    5%
1      Bob  30.0   60000 2021-05-15    Finance   10%
2  Charlie  35.0   70000        NaT         IT   15%
3    David  40.0   80000 2023-07-20  Marketing  None
4      Eve   0.0   90000 2020-03-01       None   20%
5     None  28.0   55000 2023-01-01         HR    5%
6    Alice  25.0   50000 2022-01-10         HR    5%


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(-20,inplace=True)


Duplicated Rows

In [7]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)
df

1


Unnamed: 0,Name,Age,Salary,JoinDate,Department,Bonus
0,Alice,25.0,50000,2022-01-10,HR,5%
1,Bob,30.0,60000,2021-05-15,Finance,10%
2,Charlie,35.0,70000,NaT,IT,15%
3,David,40.0,80000,2023-07-20,Marketing,
4,Eve,0.0,90000,2020-03-01,,20%
5,,28.0,55000,2023-01-01,HR,5%
