# News, Trust, and Data Access

## Data Cleaning

This Jupyter notebook cleans the coded survey responses and reshapes them into a tidy dataset for analysis and visualization.

In [133]:
# Import needed packages
import pandas as pd
import wrangle

# Load the coded survey responses; the CSV's 'index' column becomes the row index
survey = pd.read_csv(
    'data/all_responses_coded.csv',
    index_col='index',
)

# Preview the first rows of the imported data
survey.head()

Unnamed: 0_level_0,RespondentID,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A55,A56,A57,A58,A59,A60,A61,A62,StartDate,EndDate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6176264298,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,5/1/17 15:41,5/1/17 15:43
1,6176263960,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,5/1/17 15:35,5/1/17 15:43
2,6176258621,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,5/1/17 15:38,5/1/17 15:40
3,6176257082,0,0,0,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,5/1/17 15:38,5/1/17 15:39
4,6176256111,0,0,0,1,0,0,1,0,0,...,0,1,1,0,0,0,0,0,5/1/17 15:34,5/1/17 15:39


In [134]:
# Lookup tables supplied by the local `wrangle` module
col_names, questions = wrangle.col_names, wrangle.questions

# Give the coded answer columns their readable names
survey = survey.rename(columns=col_names)

# Collapse each question's answer columns into one feature holding the label
# of the column with the largest value in that row.
# NOTE(review): idxmax returns the first column on ties, so a row whose
# answer columns are all 0 would receive the first option's label rather than
# a null -- confirm every respondent answered each question.
for question in questions:
    answer_cols = questions[question]
    survey[question] = survey.loc[:, answer_cols].idxmax(axis=1)
    survey = survey.drop(columns=answer_cols)

# Remove the columns that are not needed downstream
unwanted_cols = ['Trump_Approval', 'Other', 'Region', 'Device', 'StartDate', 'EndDate']
survey = survey.drop(columns=unwanted_cols)

In [135]:
# Count missing cells per column, then total them for the whole frame
missing_per_column = survey.isna().sum()
missing_per_column.sum()

0

In [136]:
# Drop rows containing null values (only if there are any).
# `survey.isna()` is a DataFrame of booleans, and a DataFrame has no single
# truth value (`if survey.isna():` raises ValueError), so the mask is reduced
# to one bool with `.any().any()` before branching.
if survey.isna().any().any():
    survey = survey.dropna()

### Melt into a tidy dataset

In [137]:
# Columns kept as identifiers; every remaining column is unpivoted into a
# (NewsSource, SourceTrust) pair, giving one row per identifier/column pair.
respondent_cols = [
    'RespondentID',
    'Political_View',
    'General_Trust',
    'Pay_For_News',
    'Data_Access',
    'Age',
    'Gender',
    'Income',
]

tidy_survey = survey.melt(
    id_vars=respondent_cols,
    var_name='NewsSource',
    value_name='SourceTrust',
)

# Preview the reshaped data
tidy_survey.head()

Unnamed: 0,RespondentID,Political_View,General_Trust,Pay_For_News,Data_Access,Age,Gender,Income,NewsSource,SourceTrust
0,6176264298,Moderate,Fair amount,No,No Change,30-44,Male,"10,000-24,999",NYT,1
1,6176263960,Moderate,Fair amount,No,Increase Trust,18-29,Female,"0-9,999",NYT,1
2,6176258621,Liberal,Fair amount,Yes,Increase Trust,30-44,Male,"125,000-149,999",NYT,1
3,6176257082,Liberal,Fair amount,No,Increase Trust,18-29,Male,"125,000-149,999",NYT,0
4,6176256111,Liberal,Fair amount,Yes,No Change,30-44,Male,"10,000-24,999",NYT,1


## Save cleaned data to new CSV

In [138]:
# Write the tidy dataset to the cleaned-data CSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
tidy_survey.to_csv('data/news_trust_data__clean.csv', index=False)