# Pandas How to add new column to existing DataFrame

* add completely new column
* add new column based on existing column
* matching the content of the DataFrame

Bonus
* how to merge/concat DataFrame and Series
* read csv use converters
* join list to a DataFrame
* check dataframe for duplicated data

In [5]:
def strip_space(text):
    try:
        return text.strip()
    except AttributeError:
        return text
    

In [6]:
# https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset#movie_metadata.csv

# read a dataset movies
import pandas as pd
movies = pd.read_csv('../csv/movie_metadata.csv', 
                     usecols=['title_year', 'movie_title', 'director_name', 'plot_keywords', 'budget']
                    ,converters = {'movie_title' : strip_space}
                    )
movies.head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
0,James Cameron,Avatar,avatar|future|marine|native|paraplegic,237000000.0,2009.0
1,Gore Verbinski,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,300000000.0,2007.0
2,Sam Mendes,Spectre,bomb|espionage|sequel|spy|terrorist,245000000.0,2015.0
3,Christopher Nolan,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,250000000.0,2012.0
4,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,


In [3]:
movies[movies.movie_title == 'Avatar ']

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year
0,James Cameron,Avatar,avatar|future|marine|native|paraplegic,237000000.0,2009.0


In [7]:
movies.dtypes

director_name     object
movie_title       object
plot_keywords     object
budget           float64
title_year       float64
dtype: object

In [8]:
movies['movie_title'] = movies.movie_title.str.strip()

In [9]:
movies.shape

(5043, 5)

In [10]:
movies[movies.movie_title.duplicated(keep=False)].shape

(247, 5)

In [11]:
movies.drop_duplicates(subset=['movie_title'], keep='first', inplace=True)

In [12]:
movies.shape

(4916, 5)

## add completely new column

In [13]:
import numpy as np
movies['e'] = np.NaN
movies.head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year,e
0,James Cameron,Avatar,avatar|future|marine|native|paraplegic,237000000.0,2009.0,
1,Gore Verbinski,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,300000000.0,2007.0,
2,Sam Mendes,Spectre,bomb|espionage|sequel|spy|terrorist,245000000.0,2015.0,
3,Christopher Nolan,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,250000000.0,2012.0,
4,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,,


In [14]:
movies['f'] = 1
movies.head()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year,e,f
0,James Cameron,Avatar,avatar|future|marine|native|paraplegic,237000000.0,2009.0,,1
1,Gore Verbinski,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,300000000.0,2007.0,,1
2,Sam Mendes,Spectre,bomb|espionage|sequel|spy|terrorist,245000000.0,2015.0,,1
3,Christopher Nolan,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,250000000.0,2012.0,,1
4,Doug Walker,Star Wars: Episode VII - The Force Awakens,,,,,1


## add new column based on existing column

In [15]:
movies['century'] = movies['title_year'] > 2000
movies.tail()

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year,e,f,century
5038,Scott Smith,Signed Sealed Delivered,fraud|postal worker|prison|theft|trial,,2013.0,,1,True
5039,,The Following,cult|fbi|hideout|prison escape|serial killer,,,,1,False
5040,Benjamin Roberds,A Plague So Pleasant,,1400.0,2013.0,,1,True
5041,Daniel Hsia,Shanghai Calling,,,2012.0,,1,True
5042,Jon Gunn,My Date with Drew,actress name in title|crush|date|four word tit...,1100.0,2004.0,,1,True


In [16]:
movies['century'] = movies.century.map({True:'21 Century', False:'20 Century'})
movies.tail(10)

Unnamed: 0,director_name,movie_title,plot_keywords,budget,title_year,e,f,century
5033,Shane Carruth,Primer,changing the future|independent film|invention...,7000.0,2004.0,,1,21 Century
5034,Neill Dela Llana,Cavite,jihad|mindanao|philippines|security guard|squa...,7000.0,2005.0,,1,21 Century
5035,Robert Rodriguez,El Mariachi,assassin|death|guitar|gun|mariachi,7000.0,1992.0,,1,20 Century
5036,Anthony Vallone,The Mongol King,jewell|mongol|nostradamus|stepnicka|vallone,3250.0,2005.0,,1,21 Century
5037,Edward Burns,Newlyweds,written and directed by cast member,9000.0,2011.0,,1,21 Century
5038,Scott Smith,Signed Sealed Delivered,fraud|postal worker|prison|theft|trial,,2013.0,,1,21 Century
5039,,The Following,cult|fbi|hideout|prison escape|serial killer,,,,1,20 Century
5040,Benjamin Roberds,A Plague So Pleasant,,1400.0,2013.0,,1,21 Century
5041,Daniel Hsia,Shanghai Calling,,,2012.0,,1,21 Century
5042,Jon Gunn,My Date with Drew,actress name in title|crush|date|four word tit...,1100.0,2004.0,,1,21 Century


## matching the content of the DataFrame

In [None]:
#movies = movies.set_index('movie_title')

In [17]:
watched = pd.Series([True, True], index=['Avatar', 'Spectre'], name='watched')
watched

Avatar     True
Spectre    True
Name: watched, dtype: bool

In [20]:
watched.shape

(2,)

In [19]:
movies.shape

(4916, 8)

In [24]:
df_concat = pd.concat([movies.set_index('movie_title'), watched.to_frame()], axis=1)
df_concat.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,director_name,plot_keywords,budget,title_year,e,f,century,watched
#Horror,Tara Subkoff,bullying|cyberbullying|girl|internet|throat sl...,1500000.0,2015.0,,1,21 Century,
10 Cloverfield Lane,Dan Trachtenberg,alien|bunker|car crash|kidnapping|minimal cast,15000000.0,2016.0,,1,21 Century,
10 Days in a Madhouse,Timothy Hines,,12000000.0,2015.0,,1,21 Century,
10 Things I Hate About You,Gil Junger,dating|protective father|school|shrew|teen movie,16000000.0,1999.0,,1,20 Century,
"10,000 B.C.",Christopher Barnard,,,,,1,20 Century,


In [22]:
df_concat.watched.value_counts()

True    2
Name: watched, dtype: int64

In [23]:
df_concat[df_concat.watched == True]

Unnamed: 0,director_name,plot_keywords,budget,title_year,e,f,century,watched
Avatar,James Cameron,avatar|future|marine|native|paraplegic,237000000.0,2009.0,,1,21 Century,True
Spectre,Sam Mendes,bomb|espionage|sequel|spy|terrorist,245000000.0,2015.0,,1,21 Century,True
