In [1]:
import pandas as pd
df = pd.DataFrame([{'col1':'a', 'col2':'1'}, 
                   {'col1':'b', 'col2':'2'}])
df.dtypes


col1    object
col2    object
dtype: object

In [2]:
df

Unnamed: 0,col1,col2
0,a,1
1,b,2


In [3]:
df['col2-int'] = df['col2'].astype(int)    # ①
df


Unnamed: 0,col1,col2,col2-int
0,a,1,1
1,b,2,2


In [4]:
df.dtypes

col1        object
col2        object
col2-int     int64
dtype: object

In [5]:
s = pd.Series(['1', '2', '4.7', 'pandas', '10'])    
s.astype(float)


ValueError: could not convert string to float: 'pandas'

In [6]:
s.astype(float, errors='ignore')

0         1
1         2
2       4.7
3    pandas
4        10
dtype: object

In [7]:
pd.to_numeric(s)

ValueError: Unable to parse string "pandas" at position 3

In [8]:
pd.to_numeric(s, errors='coerce')

0     1.0
1     2.0
2     4.7
3     NaN
4    10.0
dtype: float64

In [10]:
df = pd.read_csv("/Users/qiwsir/Documents/Codes/DataSet/sales-data/sales_data_types.csv")
df.shape

(5, 10)

In [12]:
df[['Customer Number', '2016']]

Unnamed: 0,Customer Number,2016
0,10002.0,"$125,000.00"
1,552278.0,"$920,000.00"
2,23477.0,"$50,000.00"
3,24900.0,"$350,000.00"
4,651029.0,"$15,000.00"


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
Customer Number    5 non-null float64
Customer Name      5 non-null object
2016               5 non-null object
2017               5 non-null object
Percent Growth     5 non-null object
Jan Units          5 non-null object
Month              5 non-null int64
Day                5 non-null int64
Year               5 non-null int64
Active             5 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 480.0+ bytes


In [14]:
df['Customer Number'].astype(int).astype(str)

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: object

In [15]:
def convert_money(value):
    new_value = value.replace("$","").replace(",","")  # ②
    return float(new_value)

df['2016'].apply(convert_money)    # ③

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

In [17]:
df[['Percent Growth']]

Unnamed: 0,Percent Growth
0,30.00%
1,10.00%
2,25.00%
3,4.00%
4,-15.00%


In [18]:
df['Percent Growth'].apply(lambda x: float(x.replace("%", "")) / 100)


0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64

In [19]:
df[['Jan Units']]

Unnamed: 0,Jan Units
0,500
1,700
2,125
3,75
4,Closed


In [20]:
pd.to_numeric(df['Jan Units'], errors='coerce')

0    500.0
1    700.0
2    125.0
3     75.0
4      NaN
Name: Jan Units, dtype: float64

In [21]:
df[['Active']]

Unnamed: 0,Active
0,Y
1,Y
2,Y
3,Y
4,N


In [22]:
import numpy as np
np.where(df['Active']=='Y', 1, 0)


array([1, 1, 1, 1, 0])

In [23]:
import pandas as pd
import numpy as np

def convert_money(value):
    new_value = value.replace(",","").replace("$","")
    return float(new_value)

df2 = pd.read_csv("/Users/qiwsir/Documents/Codes/DataSet/sales-data/sales_data_types.csv",
                  dtype = {'Customer Number': 'int'},
                  converters = {'2016': convert_money,
                                '2017': convert_money,
                                'Percent Growth': lambda x: float(x.replace("%", "")) / 100,
                                'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
                                'Active': lambda x: np.where(x =='Y', 1, 0),
                               })
df2['Date'] = pd.to_datetime(df[['Month', 'Day', 'Year']])

In [26]:
cols = df2.columns
len(cols)

11

In [29]:
df2[cols[:4]]

Unnamed: 0,Customer Number,Customer Name,2016,2017
0,10002,Quest Industries,125000.0,162500.0
1,552278,Smith Plumbing,920000.0,1012000.0
2,23477,ACME Industrial,50000.0,62500.0
3,24900,Brekke LTD,350000.0,490000.0
4,651029,Harbor Co,15000.0,12750.0


In [30]:
df2[cols[4:]]

Unnamed: 0,Percent Growth,Jan Units,Month,Day,Year,Active,Date
0,0.3,500.0,1,10,2015,1,2015-01-10
1,0.1,700.0,6,15,2014,1,2014-06-15
2,0.25,125.0,3,29,2016,1,2016-03-29
3,0.04,75.0,10,27,2015,1,2015-10-27
4,-0.15,,2,2,2014,0,2014-02-02


In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 11 columns):
Customer Number    5 non-null int64
Customer Name      5 non-null object
2016               5 non-null float64
2017               5 non-null float64
Percent Growth     5 non-null float64
Jan Units          4 non-null float64
Month              5 non-null int64
Day                5 non-null int64
Year               5 non-null int64
Active             5 non-null object
Date               5 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(4), object(2)
memory usage: 520.0+ bytes
