In [2]:
import pandas as pd

### Open Data

In [157]:
# open new scraped comments

new_comments = pd.read_csv("./data/reddit/DCforRent_12_5_comments.csv")
new_comments.head()

Unnamed: 0,comment_id,submission_id,author,body,score,year,month
0,isl1exn,y57cbw,eeek0711,Or Cleveland Park in DC,4,2022,10
1,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10
2,islh5rt,y57cbw,nonmimeticform,Baltimore,3,2022,10
3,isuelis,y57cbw,dcgirlsmallworld,If you are looking for a quiet neighborhood wi...,3,2022,10
4,isl1grp,y57cbw,eeek0711,I lived in the DeLano Apartments in Woodley Pa...,2,2022,10


In [158]:
# open old scraped comments
coded_comments = pd.read_csv("./data/reddit/DCforRent_comments_CODED_SET.csv",usecols=["submission_id","author","body","score","year","month","coded"])
coded_comments.head()

Unnamed: 0,submission_id,author,body,score,year,month,coded
0,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0
1,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,1
2,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,1
3,17ed6rg,mm10102,Woodley park and Cleveland park have one beds ...,1,2023,10,1
4,17ed6rg,Musictravels23,Hey there!! I have a fuIIy furnished sunny spa...,1,2023,10,0


In [159]:
len(coded_comments)

28

### Merge 

In [160]:
coded_with_id = pd.merge(new_comments, coded_comments, on=['submission_id','author','body',"year","month"],how='inner')

In [161]:
len(coded_with_id)

25

In [162]:
coded_with_id = coded_with_id.drop("score_y",axis=1)

In [163]:
coded_with_id = coded_with_id.rename(columns={"score_x":"score"})

In [164]:
coded_with_id

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,coded
0,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0
1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,1
2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,1
3,k7632z8,17ed6rg,Musictravels23,Hey there!! I have a fuIIy furnished sunny spa...,1,2023,10,0
4,io5bcjq,xciciq,cheesyuser,Thats a difficult ask. The 2000 range may be d...,2,2022,9,0
5,io6c09j,xciciq,,[deleted],3,2022,9,0
6,jlfzfln,13qokxh,myloversacarnivore,Neighborhood safety is a separate concern from...,6,2023,5,1
7,ibf4in0,v6dd5a,Competitive_Green_23,I have a house by there for rent minus 1 bathr...,0,2022,6,0
8,ibpkvnv,v6dd5a,Competitive_Green_23,Excellent best of luck to both of you,2,2022,6,0
9,gcswkrv,jwtuba,bleucheeez,Look for the housing facebook groups. There ar...,1,2020,11,0


In [166]:
len(coded_comments) - len(coded_with_id) # we lost 3 coded comments

3

### Identify the comments that did not join

In [167]:
test = coded_with_id.merge(coded_comments, on='body', how='outer', indicator=True)

lost_comments = test[test["_merge"] == "right_only"]

lost_comments.head()

Unnamed: 0,comment_id,submission_id_x,author_x,body,score_x,year_x,month_x,coded_x,submission_id_y,author_y,score_y,year_y,month_y,coded_y,_merge
3,,,,Crime. I‚Äôve heard of tenants leaving mid-lea...,,,,,v7mf0p,bwood07,6,2022,6,1,right_only
14,,,,I moved to DC back in March and wasn‚Äôt very ...,,,,,vg2hi9,PeterBeHangin,3,2022,6,1,right_only
24,,,,Woodley park and Cleveland park have one beds ...,,,,,17ed6rg,mm10102,1,2023,10,1,right_only


In [168]:
lost_comments[["body","submission_id_y","author_y","score_y","year_y","month_y","coded_y"]]

Unnamed: 0,body,submission_id_y,author_y,score_y,year_y,month_y,coded_y
3,Crime. I‚Äôve heard of tenants leaving mid-lea...,v7mf0p,bwood07,6,2022,6,1
14,I moved to DC back in March and wasn‚Äôt very ...,vg2hi9,PeterBeHangin,3,2022,6,1
24,Woodley park and Cleveland park have one beds ...,17ed6rg,mm10102,1,2023,10,1


In [180]:
comment_found = new_comments[(new_comments["author"] == lost_comments.iloc[2]["author_y"]) & (new_comments["submission_id"] == lost_comments.iloc[2]["submission_id_y"])]

comment_found

Unnamed: 0,comment_id,submission_id,author,body,score,year,month
38,k6rdhmy,17ed6rg,mm10102,Woodley park and Cleveland park have one beds ...,1,2023,10


In [147]:
comment_found = comment_found.loc[[1574]]

comment_found

Unnamed: 0,comment_id,submission_id,author,body,score,year,month
1574,lbwsjtm,1dwhdrj,DC_Tribalist,"Yep. DuPont and Logan have a few panhandlers, ...",3,2024,7


In [181]:
comment_found["coded"] = lost_comments.iloc[2]["coded_y"]

comment_found

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_found["coded"] = lost_comments.iloc[2]["coded_y"]


Unnamed: 0,comment_id,submission_id,author,body,score,year,month,coded
38,k6rdhmy,17ed6rg,mm10102,Woodley park and Cleveland park have one beds ...,1,2023,10,1


In [182]:
coded_with_id = pd.concat([coded_with_id, comment_found])

In [183]:
coded_with_id

Unnamed: 0,comment_id,submission_id,author,body,score,year,month,coded
0,isl1dig,y57cbw,eeek0711,Takoma Park MD,3,2022,10,0
1,i9cnmct,utr8aw,dans_cafe,"Generally speaking, robberies and muggings are...",1,2022,5,1
2,i9csov5,utr8aw,Ok_Priority_1534,Downtown/Central DC.\n\nAs I'm looking at diff...,1,2022,5,1
3,k7632z8,17ed6rg,Musictravels23,Hey there!! I have a fuIIy furnished sunny spa...,1,2023,10,0
4,io5bcjq,xciciq,cheesyuser,Thats a difficult ask. The 2000 range may be d...,2,2022,9,0
5,io6c09j,xciciq,,[deleted],3,2022,9,0
6,jlfzfln,13qokxh,myloversacarnivore,Neighborhood safety is a separate concern from...,6,2023,5,1
7,ibf4in0,v6dd5a,Competitive_Green_23,I have a house by there for rent minus 1 bathr...,0,2022,6,0
8,ibpkvnv,v6dd5a,Competitive_Green_23,Excellent best of luck to both of you,2,2022,6,0
9,gcswkrv,jwtuba,bleucheeez,Look for the housing facebook groups. There ar...,1,2020,11,0


In [184]:
len(coded_with_id)

28

In [185]:
coded_with_id.to_csv("./data/reddit/DCforRent_comments_with_commentid_CODED_SET.csv")