In [17]:
import pandas
import numpy
import matplotlib as plt

from sklearn import linear_model, feature_extraction

def categorical_features(row):
    d = {}
    d["STATE"] = row[1]["STATE"]
    return d
#feature
def last_poll(full_data):
    """
    Create feature from last poll in each state
    """
    #adding topic
    
    # Only care about republicans
    repub = full_data[full_data["PARTY"] == "Rep"]
    repub = full_data[full_data["CHOICE"]!="Undecided"]
    
    # Sort by date
    chron = repub.sort_values(by="DATE", ascending=True)

    # Only keep the last one
    dedupe = chron.drop_duplicates(subset="STATE", keep="last")

    # Remove national polls
    return dedupe[dedupe["STATE"] != "US"]
    
if __name__ == "__main__":
    # Read in the X data
    all_data = pandas.read_csv("data.csv")

    # Remove non-states
    all_data = all_data[pandas.notnull(all_data["STATE"])]

    # split between testing and training
    train_x = last_poll(all_data[all_data["TOPIC"] == '2012-president'])

    test_x=train_x.tail(10)
    train_x=train_x.head(40)
    train_x.set_index("STATE")
    test_x.set_index("STATE")

    # Read in the Y data
    y_data = pandas.read_csv("../data/2012_pres.csv", sep=';')
    y_data = y_data[y_data["PARTY"] == "R"]
    y_data = y_data[pandas.notnull(y_data["GENERAL %"])]
    y_data["GENERAL %"] = [float(x.replace(",", ".").replace("%", ""))
                           for x in y_data["GENERAL %"]]
    y_data["STATE"] = y_data["STATE ABBREVIATION"]
    y_data.set_index("STATE")

    backup = train_x
    train_x = y_data.merge(train_x, on="STATE",how='left')
    
    # make sure we have all states in the test data
    for ii in set(y_data.STATE) - set(test_x.STATE):
        new_row = pandas.DataFrame([{"STATE": ii}])
        test_x = test_x.append(new_row)

    # format the data for regression
    train_x = pandas.concat([train_x.STATE.astype(str).str.get_dummies(),
                             train_x], axis=1)
    test_x = pandas.concat([test_x.STATE.astype(str).str.get_dummies(),
                             test_x], axis=1)
        
    # handle missing data
    for dd in train_x, test_x:                
        dd["NOPOLL"] = pandas.isnull(dd["VALUE"])
        dd["VALUE"] = dd["VALUE"].fillna(0.0)
        
    # create feature list
    features = list(y_data.STATE)
    features.append("VALUE")
    features.append("NOPOLL")    
        
    # fit the regression
    mod = linear_model.LinearRegression()
    mod.fit(train_x[features], train_x["GENERAL %"])

    # Write out the model
    with open("model.txt", 'w') as out:
        out.write("BIAS\t%f\n" % mod.intercept_)
        for jj, kk in zip(features, mod.coef_):
            out.write("%s\t%f\n" % (jj, kk))
    
    # Write the predictions
    pred_test = mod.predict(test_x[features])
    with open("pred.txt", 'w') as out:
        for ss, vv in sorted(zip(list(test_x.STATE), pred_test)):
            out.write("%s\t%f\n" % (ss, vv))


In [18]:
print("Mean squared error: %.2f"
      % numpy.mean((mod.predict(test_x[features]) - train_x["GENERAL %"]) ** 2))
#test data - train data
#(prediction - observed)^2

Mean squared error: 233.46


In [19]:
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % mod.score(test_x[features],train_x["GENERAL %"]))

Variance score: -0.70


In [20]:
all_data


Unnamed: 0,YEAR,DATE,TOPIC,NAME,MOE,SUBPOP,SUBPOPID,CHOICE,PARTY,VALUE,OBS,STATE
0,2012,2012-10-14,2012-president,2012 Washington DC President: Romney vs. Obama,2.80,Likely Voters,1,Romney,Rep,8.0,1222.0,DC
1,2012,2012-10-14,2012-president,2012 Washington DC President: Romney vs. Obama,2.80,Likely Voters,1,Obama,Dem,88.0,1222.0,DC
2,2012,2012-10-10,2012-president,2012 Idaho President: Romney vs. Obama,4.00,Likely Voters,1,Romney,Rep,63.0,625.0,ID
3,2012,2012-10-10,2012-president,2012 Idaho President: Romney vs. Obama,4.00,Likely Voters,1,Obama,Dem,27.0,625.0,ID
4,2012,2012-10-10,2012-president,2012 Idaho President: Romney vs. Obama,4.00,Likely Voters,1,Undecided,,6.0,625.0,ID
5,2012,2012-10-10,2012-president,2012 Idaho President: Romney vs. Obama,4.00,Likely Voters,1,Other,,1.0,625.0,ID
6,2012,2012-10-10,2012-president,2012 Idaho President: Romney vs. Obama,4.00,Likely Voters,1,Johnson,,3.0,625.0,ID
7,2012,2012-10-27,2012-president,2012 Rhode Island President: Romney vs. Obama,4.00,Likely Voters,1,Undecided,,8.0,601.0,RI
8,2012,2012-10-27,2012-president,2012 Rhode Island President: Romney vs. Obama,4.00,Likely Voters,1,Obama,Dem,54.0,601.0,RI
9,2012,2012-10-27,2012-president,2012 Rhode Island President: Romney vs. Obama,4.00,Likely Voters,1,Romney,Rep,33.0,601.0,RI


In [32]:
if (all_data.TOPIC=="obama-job-approval"):
    print(all_data.TOPIC)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().