# Creating date column based on other columns

**SITUATION:** You have dates broken up in 3 columns and need to create an actual column with dates.

In [1]:
import pandas as pd

In [2]:
df = pd.read_clipboard()

In [3]:
df

Unnamed: 0,MONTH,DAY,YEAR
0,7,1,2019
1,7,2,2019
2,7,3,2019
3,7,4,2019
4,7,5,2019
5,7,6,2019
6,7,7,2019
7,7,8,2019
8,7,9,2019
9,7,10,2019


In [5]:
df['DATE'] = pd.to_datetime(df[['MONTH', 'DAY', 'YEAR']])

In [6]:
df

Unnamed: 0,MONTH,DAY,YEAR,DATE
0,7,1,2019,2019-07-01
1,7,2,2019,2019-07-02
2,7,3,2019,2019-07-03
3,7,4,2019,2019-07-04
4,7,5,2019,2019-07-05
5,7,6,2019,2019-07-06
6,7,7,2019,2019-07-07
7,7,8,2019,2019-07-08
8,7,9,2019,2019-07-09
9,7,10,2019,2019-07-10


In [7]:
df.dtypes

MONTH             int64
DAY               int64
YEAR              int64
DATE     datetime64[ns]
dtype: object

# Combining string values from multiple columns into a single column

**SITUATION:** You have a first name column and a last name column.  But you need a column that is of the format: last name, first name

In [69]:
df = pd.read_clipboard()
df

Unnamed: 0,ID,FirstName,LastName
0,1,John,Smith
1,2,Jane,Doe
2,3,Someone,Else


In [70]:
df['LastFirstName'] = df['LastName'] + ', ' + df['FirstName']
df

Unnamed: 0,ID,FirstName,LastName,LastFirstName
0,1,John,Smith,"Smith, John"
1,2,Jane,Doe,"Doe, Jane"
2,3,Someone,Else,"Else, Someone"


# Split up string into multiple columns

**SITUATION:** You have a column with string values, but need the string value broken up into multiple columns.

In [9]:
df = pd.read_clipboard()

In [10]:
df

Unnamed: 0,ID,LastFirstName
0,1,John Smith
1,2,Jane Doe
2,3,Someone Else


In [11]:
df[['FirstName', 'LastName']] = df['LastFirstName'].str.split(' ', expand=True)

In [12]:
df

Unnamed: 0,ID,LastFirstName,FirstName,LastName
0,1,John Smith,John,Smith
1,2,Jane Doe,Jane,Doe
2,3,Someone Else,Someone,Else


#### You don't have to consume all the split values into multiple columns, you can choose to keep the first split value or second value using indexing ( [0] or [1] ):

In [33]:
df = pd.read_clipboard()
df

Unnamed: 0,ID,LOCATION
0,1,"Los Angeles, CA"
1,2,"Washington, D.C."
2,3,"Columbus, OH"


In [35]:
df['CITY'] = df['LOCATION'].str.split(', ', expand=True)[0]

In [36]:
df

Unnamed: 0,ID,LOCATION,CITY
0,1,"Los Angeles, CA",Los Angeles
1,2,"Washington, D.C.",Washington
2,3,"Columbus, OH",Columbus


In [38]:
df['STATE'] = df['LOCATION'].str.split(', ', expand=True)[1]

In [39]:
df

Unnamed: 0,ID,LOCATION,CITY,STATE
0,1,"Los Angeles, CA",Los Angeles,CA
1,2,"Washington, D.C.",Washington,D.C.
2,3,"Columbus, OH",Columbus,OH


# Clean columns of a specific data type

**SITUATION:** There are hashtag characters in your string columns.  You could have several string columns.  We need to remove them.  There are 3 different ways depending on level of control.

In [15]:
df = pd.read_clipboard()

In [16]:
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


### Method 1: Using ```isinstance()``` to check for data type:

In [17]:
def remove_hashtag1(df):
    for column in df.columns:
        # if first value of column is of type str, then remove the hashtags
        if isinstance(df[column][0], str):
            df[column] = df[column].str.replace('#', '')

In [18]:
remove_hashtag1(df)

In [19]:
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,Hello,1,4,Steve,Smith
1,GoodBye,2,5,Jane,Doe
2,Welcome,3,6,John,Hicks


### Method 2: Using ```dtype()``` to check for data type:

In [22]:
def remove_hashtag2(df):
    for column in df.columns:
        # if column is of type 'object', then remove the hashtags
        if df[column].dtype == 'object':
            df[column] = df[column].str.replace('#', '')

In [20]:
df = pd.read_clipboard()
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


In [23]:
remove_hashtag2(df)

In [24]:
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,Hello,1,4,Steve,Smith
1,GoodBye,2,5,Jane,Doe
2,Welcome,3,6,John,Hicks


### Method 3: Using  ```is_string_dtype``` to detect column's data type:

In [25]:
from pandas.api.types import is_string_dtype

In [26]:
df = pd.read_clipboard()
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


In [29]:
def remove_hashtag3(df):
    for column in df.columns:
        # if column is of a string data type, then strip the white space
        if is_string_dtype(df[column]):
            df[column] = df[column].str.replace('#', '')

In [30]:
remove_hashtag3(df)

In [31]:
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,Hello,1,4,Steve,Smith
1,GoodBye,2,5,Jane,Doe
2,Welcome,3,6,John,Hicks


# Select Columns by Data Type

**SITUATION:** You need to isolate numeric columns from string columns or vice versa

In [52]:
df = pd.read_clipboard()
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


Get count of columns by data type:

In [55]:
df.dtypes.value_counts()

object    3
int64     2
dtype: int64

In [41]:
df_num = df.select_dtypes(include='number')
df_num

Unnamed: 0,Numeric1,Numeric2
0,1,4
1,2,5
2,3,6


In [42]:
df_str = df.select_dtypes(include='object')
df_str

Unnamed: 0,str_BadData1,str_BadData2,str_BadData3
0,#Hello#,#Steve#,#Smith#
1,#GoodBye#,#Jane#,#Doe#
2,#Welcome#,#John#,#Hicks#


# Adding prefix or suffix to your column names

In [50]:
df = pd.read_clipboard()
df

Unnamed: 0,str_BadData1,Numeric1,Numeric2,str_BadData2,str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


In [51]:
df = df.add_prefix('BEGIN_')
df

Unnamed: 0,BEGIN_str_BadData1,BEGIN_Numeric1,BEGIN_Numeric2,BEGIN_str_BadData2,BEGIN_str_BadData3
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


In [49]:
df = df.add_suffix('_END')
df

Unnamed: 0,str_BadData1_END,Numeric1_END,Numeric2_END,str_BadData2_END,str_BadData3_END
0,#Hello#,1,4,#Steve#,#Smith#
1,#GoodBye#,2,5,#Jane#,#Doe#
2,#Welcome#,3,6,#John#,#Hicks#


# Finding duplicate records

In [56]:
df = pd.read_clipboard()
df

Unnamed: 0,K1,K2
0,one,1
1,one,2
2,one,3
3,two,3
4,two,3
5,two,4
6,two,4


In [57]:
duplicate_records = df[df.duplicated()]
duplicate_records

Unnamed: 0,K1,K2
4,two,3
6,two,4


# Removing duplicate records

In [58]:
df = pd.read_clipboard()
df

Unnamed: 0,K1,K2
0,one,1
1,one,2
2,one,3
3,two,3
4,two,3
5,two,4
6,two,4


In [59]:
df.drop_duplicates()  # by default, duplicate is defined by all columns

Unnamed: 0,K1,K2
0,one,1
1,one,2
2,one,3
3,two,3
5,two,4


or define what is "duplicate" by column name(s):

In [60]:
df = pd.read_clipboard()
df

Unnamed: 0,K1,K2
0,one,1
1,one,2
2,one,3
3,two,3
4,two,3
5,two,4
6,two,4


In [61]:
df.drop_duplicates(subset='K1')  # Record is considered "duplicated" based on column K1

Unnamed: 0,K1,K2
0,one,1
3,two,3


# Create a categorical column based on numeric values or a.k.a. assigning numbers to categories based on pre-defined ranges

**SITUATION:** You have people's ages, but now you need to assign their age to: 'CHILD', 'ADULT', or 'ELDERLY' category based on their age

In [71]:
import numpy as np

Let's create 50 random integers to represent age whose values are between 1 and 99:

In [77]:
random_integers = np.random.randint(1, 99, 50)

In [78]:
random_integers

array([29, 20, 46, 89, 47, 23, 65, 49, 66, 31,  6, 65, 90, 30, 92, 63, 31,
       18, 66, 58, 76, 17, 54, 96, 95, 34, 95, 97, 56, 53, 92, 95, 64, 83,
       82, 25, 47, 33, 95, 60, 60, 98, 32, 93, 33, 28,  7, 60, 20,  7])

In [79]:
df = pd.DataFrame({'AGE': random_integers})

In [80]:
df.head(10)

Unnamed: 0,AGE
0,29
1,20
2,46
3,89
4,47
5,23
6,65
7,49
8,66
9,31


In [81]:
df['AGE_GROUP'] = pd.cut(df['AGE'], bins=[0, 18, 65, 99], labels=['CHILD', 'ADULT', 'ELDERLY'])
df.head(15)

Unnamed: 0,AGE,AGE_GROUP
0,29,ADULT
1,20,ADULT
2,46,ADULT
3,89,ELDERLY
4,47,ADULT
5,23,ADULT
6,65,ADULT
7,49,ADULT
8,66,ELDERLY
9,31,ADULT
