In [1]:
def read_tsv(tar, fname):
    member = tar.getmember(fname)
    print(member.name)
    tf = tar.extractfile(member)
    data = []
    labels = []
    for line in tf:
        line = line.decode("utf-8")
        (label,text) = line.strip().split("\t")
        labels.append(label)
        data.append(text)
    return data, labels

In [2]:
def read_files(tarfname):
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    
    trainname = "train.tsv"
    devname = "dev.tsv"
    unlabeledname = "unlabeled.tsv"
    for member in tar.getmembers():
        if 'train.tsv' in member.name:
            trainname = member.name
        elif 'dev.tsv' in member.name:
            devname = member.name
        elif 'unlabeled.tsv' in member.name:
            unlabeledname = member.name

    # no sentiment? from sentiment import read_tsv            
    class Data: pass
    sentiment = Data()
    sentiment.train_data, sentiment.train_labels = read_tsv(tar,trainname)
    print(f"train data num: {len(sentiment.train_data)}")
    sentiment.dev_data, sentiment.dev_labels = read_tsv(tar, devname)
    print(f"dev data num: {len(sentiment.dev_data)}")
    sentiment.test_data = []
    tf = tar.extractfile(unlabeledname)
    for line in tf:
        line = line.decode("utf-8")
        text = line.strip()
        sentiment.test_data.append(text)
    print(f"test data num: {len(sentiment.test_data)}")

    tar.close()
    return sentiment

In [3]:
def analyize(sentiment):
    import nltk
    sentiment.train_data_divided = [[], []]
    for index, label in enumerate(sentiment.train_labels):
        sentiment.train_data_divided[int(label == 'NEGATIVE')].append(sentiment.train_data[index])
    
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    stopwords = set(nltk.corpus.stopwords.words('english')) 
    tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "RP", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    vocabularies = [{}, {}]
    for index, data in enumerate(sentiment.train_data_divided):
        for document in data:
            token_tags = nltk.pos_tag(tokenizer.tokenize(document))
            for token, tag in token_tags:
                if token not in stopwords and tag in tags and len(token) > 1:
                    if token in vocabularies[index]:
                        vocabularies[index][token] += 1
                    else:
                        vocabularies[index][token] = 1
    
    print(sorted(vocabularies[0].items(), key=lambda x: x[1], reverse=True))
    print()
    print(sorted(vocabularies[1].items(), key=lambda x: x[1], reverse=True))

In [4]:
def transformX(sentiment, ngram_range=(1,4), min_df=2):
    from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
    from nltk import word_tokenize
    sentiment.count_vect = CountVectorizer(tokenizer=word_tokenize, ngram_range=ngram_range, min_df=min_df)
    
    sentiment.trainX = sentiment.count_vect.fit_transform(sentiment.train_data)
    sentiment.devX = sentiment.count_vect.transform(sentiment.dev_data)
    sentiment.testX = sentiment.count_vect.transform(sentiment.test_data)
    print(f"feature num: {sentiment.trainX.shape[1]}")

    return sentiment

def transformY(sentiment):
    from sklearn import preprocessing
    sentiment.le = preprocessing.LabelEncoder()
    
    sentiment.le.fit(sentiment.train_labels)
    sentiment.target_labels = sentiment.le.classes_
    sentiment.trainy = sentiment.le.transform(sentiment.train_labels)
    sentiment.devy = sentiment.le.transform(sentiment.dev_labels)
    
    return sentiment

In [5]:
def train_classifier(X, y):
    """Train a classifier using the given training data.

    Trains logistic regression on the input data with default parameters.
    """
    from sklearn.linear_model import LogisticRegression
    cls = LogisticRegression(C=0.8, random_state=0, solver='sag', max_iter=10000)
    cls.fit(X, y)
    return cls

def evaluate(X, yt, cls, name='data'):
    """Evaluated a classifier on the given labeled data using accuracy."""
    from sklearn import metrics
    yp = cls.predict(X)
    acc = metrics.accuracy_score(yt, yp)
    print("Accuracy on %s  is: %s" % (name, acc))
    return acc

In [6]:
# print("Reading data")
# tarfname = "data/sentiment.tar.gz"
# sentiment = read_files(tarfname)
# sentiment = transformY(sentiment)
# print()

# for max_ngram in range(2, 6):
#     print(f'max_ngram: {max_ngram}')
#     for min_df in range(1, 4):
#         print(f'min_df: {min_df}')
#         sentiment = transformX(sentiment, ngram_range=(1, max_ngram), min_df=min_df)
        
#         cls = train_classifier(sentiment.trainX, sentiment.trainy)
#         evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
#         evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
#     print()

In [7]:
import subprocess
subprocess.run(['pwd'], shell=True)
print(subprocess.Popen("pwd", shell=True, stdout=subprocess.PIPE).stdout.read())

b'/kaggle/working\n'


In [8]:
tarfname = "../input/sentiment/sentiment"
#tarfname = "../input/sentiment-pred.csv"
sentiment = read_files(tarfname)

import os 
import pandas as pd
os.path.isfile(tarfname)
os.listdir(tarfname)

IsADirectoryError: [Errno 21] Is a directory: '../input/sentiment/sentiment'

In [9]:
sentiment = transformX(sentiment)
sentiment = transformY(sentiment)
    
max_accuracy = 0.0
while True:
    cls = train_classifier(sentiment.trainX, sentiment.trainy)
    
    # Stop criterion
    accuracy = evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
     if accuracy >= max_accuracy:
         max_accuracy = accuracy
     else:
         break;

IndentationError: unexpected indent (<ipython-input-9-735e1aa5e2eb>, line 10)

In [10]:
# Expand the training set
labels = sentiment.le.inverse_transform(cls.predict(sentiment.testX))
scores = cls.decision_function(sentiment.testX)
for index, score in enumerate(scores):
    if score > 5 or score < -5:
        sentiment.train_data.append(sentiment.test_data[index])
        sentiment.train_labels.append(labels[index])
sentiment.trainX = sentiment.count_vect.transform(sentiment.train_data)
sentiment.trainy = sentiment.le.transform(sentiment.train_labels)

NameError: name 'sentiment' is not defined

In [11]:
# if __name__ == "__main__":
#    print("Reading data")
#    tarfname = "../input/sentiment/sentiment"
#    sentiment = read_files(tarfname)
#    print("\nTraining classifier")
#    while True: 
#        import classify
#        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
#    print("\nEvaluating")
#    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
#    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

In [12]:
""""{python}
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
key_1.csv
sample_submission_1.csv
train_1.csv

sample_sub = "../input/sample_submission_1.csv"

data = pd.read_csv(sample_sub)
data['Visits'] = data['Visits'] + 32
data.to_csv("32.csv", index=False)
```


``` 
citi <- income[1:68,]
macroeco <- merge(macroeco, citi[, c("date", "provision.losses")], by = "date", 
                  all.x = TRUE, all.y = TRUE)
pairs(~provision.losses + real.gdp.growth + nominal.gdp.growth + real.disposable.income.growth + 
        nominal.disposable.income.growth + unemployment.rate + cpi.inflation.rate + three + five 
      + ten + bbb.corporate.yield + mortgage.rate + prime.rate
      + dow.jones.total.stock.market.index.level + house.price.index.level
      + commercial.real.estate.price.index.level + market.volatility.index.level
      , data = macroeco, main = "Citigroup Loss Scatterplot")
cor(macroeco[, -c(1,17)], use = "complete.obs")
```

Run Time-Series Regression 
``` {r}
macroeco$date <- ts(macroeco$date)
citireg <- tslm(provision.losses ~ real.gdp.growth + unemployment.rate + three
             + dow.jones.total.stock.market.index.level + prime.rate, data = macroeco)
summary(citireg)
"""

'"{python}\nfrom subprocess import check_output\nprint(check_output(["ls", "../input"]).decode("utf8"))\n\n# Any results you write to the current directory are saved as output.\nkey_1.csv\nsample_submission_1.csv\ntrain_1.csv\n\nsample_sub = "../input/sample_submission_1.csv"\n\ndata = pd.read_csv(sample_sub)\ndata[\'Visits\'] = data[\'Visits\'] + 32\ndata.to_csv("32.csv", index=False)\n```\n\n\n``` \nciti <- income[1:68,]\nmacroeco <- merge(macroeco, citi[, c("date", "provision.losses")], by = "date", \n                  all.x = TRUE, all.y = TRUE)\npairs(~provision.losses + real.gdp.growth + nominal.gdp.growth + real.disposable.income.growth + \n        nominal.disposable.income.growth + unemployment.rate + cpi.inflation.rate + three + five \n      + ten + bbb.corporate.yield + mortgage.rate + prime.rate\n      + dow.jones.total.stock.market.index.level + house.price.index.level\n      + commercial.real.estate.price.index.level + market.volatility.index.level\n      , data = macroeco